Upload folder using huggingface_hub
Browse files- checkpoints/.hydra/config.yaml +49 -0
- checkpoints/.hydra/hydra.yaml +156 -0
- checkpoints/.hydra/overrides.yaml +1 -0
- checkpoints/checkpoint-pt-10000/model.safetensors +3 -0
- checkpoints/checkpoint-pt-10000/random_states_0.pkl +3 -0
- checkpoints/checkpoint-pt-5000/model.safetensors +3 -0
- checkpoints/checkpoint-pt-5000/random_states_0.pkl +3 -0
- checkpoints/config.json +33 -0
- checkpoints/main.log +246 -0
checkpoints/.hydra/config.yaml
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
mode: pt
|
2 |
+
device: gpu
|
3 |
+
precision: bf16
|
4 |
+
eval_only: false
|
5 |
+
predict_only: false
|
6 |
+
seed: 80085
|
7 |
+
model:
|
8 |
+
klass: local_t5
|
9 |
+
name: pszemraj/tFINE-base-65kBPE-FLAN
|
10 |
+
overwrite:
|
11 |
+
dropout_rate: 0.0
|
12 |
+
add_config:
|
13 |
+
is_bf16: false
|
14 |
+
checkpoint_path: ''
|
15 |
+
random_init: true
|
16 |
+
compile: true
|
17 |
+
tokenizer:
|
18 |
+
name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
|
19 |
+
data:
|
20 |
+
input_length: 1024
|
21 |
+
mlm_probability: 0.15
|
22 |
+
mean_noise_span_length: 3.0
|
23 |
+
num_workers: 8
|
24 |
+
optim:
|
25 |
+
name: adamwscale
|
26 |
+
base_lr: 0.008
|
27 |
+
batch_size: 120
|
28 |
+
total_steps: 80000
|
29 |
+
epochs: -1
|
30 |
+
warmup_steps: 10000
|
31 |
+
lr_scheduler: cosine
|
32 |
+
weight_decay: 0.0001
|
33 |
+
grad_clip: 1.0
|
34 |
+
grad_acc: 24
|
35 |
+
final_cosine: 1.0e-05
|
36 |
+
eval:
|
37 |
+
every_steps: 100000
|
38 |
+
steps: 500
|
39 |
+
checkpoint:
|
40 |
+
every_steps: 5000
|
41 |
+
logging:
|
42 |
+
neptune: false
|
43 |
+
neptune_creds:
|
44 |
+
project: null
|
45 |
+
api_token: null
|
46 |
+
tags: ''
|
47 |
+
every_steps: 50
|
48 |
+
grad_l2: true
|
49 |
+
weights_l2: true
|
checkpoints/.hydra/hydra.yaml
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
hydra:
|
2 |
+
run:
|
3 |
+
dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}-${logging.neptune_creds.tags}
|
4 |
+
sweep:
|
5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
6 |
+
subdir: ${hydra.job.num}
|
7 |
+
launcher:
|
8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
9 |
+
sweeper:
|
10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
11 |
+
max_batch_size: null
|
12 |
+
params: null
|
13 |
+
help:
|
14 |
+
app_name: ${hydra.job.name}
|
15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
16 |
+
|
17 |
+
'
|
18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
19 |
+
|
20 |
+
Use --hydra-help to view Hydra specific help
|
21 |
+
|
22 |
+
'
|
23 |
+
template: '${hydra.help.header}
|
24 |
+
|
25 |
+
== Configuration groups ==
|
26 |
+
|
27 |
+
Compose your configuration from those groups (group=option)
|
28 |
+
|
29 |
+
|
30 |
+
$APP_CONFIG_GROUPS
|
31 |
+
|
32 |
+
|
33 |
+
== Config ==
|
34 |
+
|
35 |
+
Override anything in the config (foo.bar=value)
|
36 |
+
|
37 |
+
|
38 |
+
$CONFIG
|
39 |
+
|
40 |
+
|
41 |
+
${hydra.help.footer}
|
42 |
+
|
43 |
+
'
|
44 |
+
hydra_help:
|
45 |
+
template: 'Hydra (${hydra.runtime.version})
|
46 |
+
|
47 |
+
See https://hydra.cc for more info.
|
48 |
+
|
49 |
+
|
50 |
+
== Flags ==
|
51 |
+
|
52 |
+
$FLAGS_HELP
|
53 |
+
|
54 |
+
|
55 |
+
== Configuration groups ==
|
56 |
+
|
57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
58 |
+
to command line)
|
59 |
+
|
60 |
+
|
61 |
+
$HYDRA_CONFIG_GROUPS
|
62 |
+
|
63 |
+
|
64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
65 |
+
|
66 |
+
'
|
67 |
+
hydra_help: ???
|
68 |
+
hydra_logging:
|
69 |
+
version: 1
|
70 |
+
formatters:
|
71 |
+
simple:
|
72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
73 |
+
handlers:
|
74 |
+
console:
|
75 |
+
class: logging.StreamHandler
|
76 |
+
formatter: simple
|
77 |
+
stream: ext://sys.stdout
|
78 |
+
root:
|
79 |
+
level: INFO
|
80 |
+
handlers:
|
81 |
+
- console
|
82 |
+
loggers:
|
83 |
+
logging_example:
|
84 |
+
level: DEBUG
|
85 |
+
disable_existing_loggers: false
|
86 |
+
job_logging:
|
87 |
+
version: 1
|
88 |
+
formatters:
|
89 |
+
simple:
|
90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
91 |
+
handlers:
|
92 |
+
console:
|
93 |
+
class: logging.StreamHandler
|
94 |
+
formatter: simple
|
95 |
+
stream: ext://sys.stdout
|
96 |
+
file:
|
97 |
+
class: logging.FileHandler
|
98 |
+
formatter: simple
|
99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
100 |
+
root:
|
101 |
+
level: INFO
|
102 |
+
handlers:
|
103 |
+
- console
|
104 |
+
- file
|
105 |
+
disable_existing_loggers: false
|
106 |
+
env: {}
|
107 |
+
mode: RUN
|
108 |
+
searchpath: []
|
109 |
+
callbacks: {}
|
110 |
+
output_subdir: .hydra
|
111 |
+
overrides:
|
112 |
+
hydra:
|
113 |
+
- hydra.mode=RUN
|
114 |
+
task: []
|
115 |
+
job:
|
116 |
+
name: main
|
117 |
+
chdir: true
|
118 |
+
override_dirname: ''
|
119 |
+
id: ???
|
120 |
+
num: ???
|
121 |
+
config_name: default
|
122 |
+
env_set: {}
|
123 |
+
env_copy: []
|
124 |
+
config:
|
125 |
+
override_dirname:
|
126 |
+
kv_sep: '='
|
127 |
+
item_sep: ','
|
128 |
+
exclude_keys: []
|
129 |
+
runtime:
|
130 |
+
version: 1.3.2
|
131 |
+
version_base: '1.1'
|
132 |
+
cwd: /workspace/nanoT5
|
133 |
+
config_sources:
|
134 |
+
- path: hydra.conf
|
135 |
+
schema: pkg
|
136 |
+
provider: hydra
|
137 |
+
- path: /workspace/nanoT5/nanoT5/configs
|
138 |
+
schema: file
|
139 |
+
provider: main
|
140 |
+
- path: ''
|
141 |
+
schema: structured
|
142 |
+
provider: schema
|
143 |
+
output_dir: /workspace/nanoT5/logs/2024-08-09/08-30-29-
|
144 |
+
choices:
|
145 |
+
local_env: default
|
146 |
+
task: pt
|
147 |
+
hydra/env: default
|
148 |
+
hydra/callbacks: null
|
149 |
+
hydra/job_logging: default
|
150 |
+
hydra/hydra_logging: default
|
151 |
+
hydra/hydra_help: default
|
152 |
+
hydra/help: default
|
153 |
+
hydra/sweeper: basic
|
154 |
+
hydra/launcher: basic
|
155 |
+
hydra/output: default
|
156 |
+
verbose: false
|
checkpoints/.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[]
|
checkpoints/checkpoint-pt-10000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f49fbb3e36013bc83be014866e4f85ff9bb2334b4c0c1154d411c64e5324b19
|
3 |
+
size 1202681712
|
checkpoints/checkpoint-pt-10000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
|
3 |
+
size 14344
|
checkpoints/checkpoint-pt-5000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60b2bfff7bec1e5e4cb66aa287758e4728355ecefceca151a67ec45441547613
|
3 |
+
size 1202681712
|
checkpoints/checkpoint-pt-5000/random_states_0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:634ae87ad9ec14553a807f970f4e595e3fef7b62fd4afaddf671a76426ff94ed
|
3 |
+
size 14344
|
checkpoints/config.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "pszemraj/tFINE-base-65kBPE-FLAN",
|
3 |
+
"architectures": [
|
4 |
+
"T5ForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"classifier_dropout": 0.0,
|
7 |
+
"d_ff": 2560,
|
8 |
+
"d_kv": 64,
|
9 |
+
"d_model": 768,
|
10 |
+
"decoder_start_token_id": 3,
|
11 |
+
"dense_act_fn": "silu",
|
12 |
+
"dropout_rate": 0.0,
|
13 |
+
"eos_token_id": 2,
|
14 |
+
"feed_forward_proj": "gated-silu",
|
15 |
+
"initializer_factor": 1.0,
|
16 |
+
"is_bf16": true,
|
17 |
+
"is_encoder_decoder": false,
|
18 |
+
"is_gated_act": true,
|
19 |
+
"layer_norm_epsilon": 1e-06,
|
20 |
+
"model_type": "t5",
|
21 |
+
"num_decoder_layers": 12,
|
22 |
+
"num_heads": 12,
|
23 |
+
"num_layers": 12,
|
24 |
+
"output_past": true,
|
25 |
+
"pad_token_id": 3,
|
26 |
+
"relative_attention_max_distance": 128,
|
27 |
+
"relative_attention_num_buckets": 32,
|
28 |
+
"tie_word_embeddings": false,
|
29 |
+
"torch_dtype": "float32",
|
30 |
+
"transformers_version": "4.44.0",
|
31 |
+
"use_cache": true,
|
32 |
+
"vocab_size": 48256
|
33 |
+
}
|
checkpoints/main.log
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[2024-08-09 08:30:30,106][Main][INFO] - Distributed environment: NO
|
2 |
+
Num processes: 1
|
3 |
+
Process index: 0
|
4 |
+
Local process index: 0
|
5 |
+
Device: cuda
|
6 |
+
|
7 |
+
Mixed precision type: bf16
|
8 |
+
|
9 |
+
[2024-08-09 08:30:30,106][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-08-09/08-30-29-
|
10 |
+
[2024-08-09 08:38:01,730][Main][INFO] - [train] Step 50 out of 80000 | Loss --> 60.113 | Grad_l2 --> 186.709 | Weights_l2 --> 8624.587 | Lr --> 0.004 | Seconds_per_step --> 8.363 |
|
11 |
+
[2024-08-09 08:42:09,928][Main][INFO] - [train] Step 100 out of 80000 | Loss --> 22.120 | Grad_l2 --> 47.074 | Weights_l2 --> 8624.166 | Lr --> 0.004 | Seconds_per_step --> 4.964 |
|
12 |
+
[2024-08-09 08:46:13,808][Main][INFO] - [train] Step 150 out of 80000 | Loss --> 12.856 | Grad_l2 --> 28.865 | Weights_l2 --> 8623.587 | Lr --> 0.004 | Seconds_per_step --> 4.878 |
|
13 |
+
[2024-08-09 08:50:08,941][Main][INFO] - [train] Step 200 out of 80000 | Loss --> 10.357 | Grad_l2 --> 30.528 | Weights_l2 --> 8623.073 | Lr --> 0.004 | Seconds_per_step --> 4.703 |
|
14 |
+
[2024-08-09 08:54:06,924][Main][INFO] - [train] Step 250 out of 80000 | Loss --> 8.792 | Grad_l2 --> 17.202 | Weights_l2 --> 8622.533 | Lr --> 0.004 | Seconds_per_step --> 4.760 |
|
15 |
+
[2024-08-09 08:58:12,688][Main][INFO] - [train] Step 300 out of 80000 | Loss --> 7.720 | Grad_l2 --> 12.189 | Weights_l2 --> 8622.034 | Lr --> 0.004 | Seconds_per_step --> 4.915 |
|
16 |
+
[2024-08-09 09:02:09,434][Main][INFO] - [train] Step 350 out of 80000 | Loss --> 7.276 | Grad_l2 --> 10.214 | Weights_l2 --> 8621.544 | Lr --> 0.004 | Seconds_per_step --> 4.735 |
|
17 |
+
[2024-08-09 09:06:02,511][Main][INFO] - [train] Step 400 out of 80000 | Loss --> 7.054 | Grad_l2 --> 10.111 | Weights_l2 --> 8621.091 | Lr --> 0.004 | Seconds_per_step --> 4.662 |
|
18 |
+
[2024-08-09 09:10:08,058][Main][INFO] - [train] Step 450 out of 80000 | Loss --> 6.941 | Grad_l2 --> 9.960 | Weights_l2 --> 8620.672 | Lr --> 0.004 | Seconds_per_step --> 4.911 |
|
19 |
+
[2024-08-09 09:14:09,465][Main][INFO] - [train] Step 500 out of 80000 | Loss --> 6.777 | Grad_l2 --> 9.558 | Weights_l2 --> 8620.252 | Lr --> 0.004 | Seconds_per_step --> 4.828 |
|
20 |
+
[2024-08-09 09:17:58,397][Main][INFO] - [train] Step 550 out of 80000 | Loss --> 6.730 | Grad_l2 --> 9.024 | Weights_l2 --> 8619.864 | Lr --> 0.004 | Seconds_per_step --> 4.579 |
|
21 |
+
[2024-08-09 09:21:50,846][Main][INFO] - [train] Step 600 out of 80000 | Loss --> 6.626 | Grad_l2 --> 7.926 | Weights_l2 --> 8619.457 | Lr --> 0.004 | Seconds_per_step --> 4.649 |
|
22 |
+
[2024-08-09 09:25:58,874][Main][INFO] - [train] Step 650 out of 80000 | Loss --> 6.504 | Grad_l2 --> 6.422 | Weights_l2 --> 8619.040 | Lr --> 0.004 | Seconds_per_step --> 4.961 |
|
23 |
+
[2024-08-09 09:29:54,562][Main][INFO] - [train] Step 700 out of 80000 | Loss --> 6.425 | Grad_l2 --> 6.909 | Weights_l2 --> 8618.645 | Lr --> 0.004 | Seconds_per_step --> 4.714 |
|
24 |
+
[2024-08-09 09:33:50,054][Main][INFO] - [train] Step 750 out of 80000 | Loss --> 6.413 | Grad_l2 --> 6.699 | Weights_l2 --> 8618.254 | Lr --> 0.004 | Seconds_per_step --> 4.710 |
|
25 |
+
[2024-08-09 09:37:48,669][Main][INFO] - [train] Step 800 out of 80000 | Loss --> 6.339 | Grad_l2 --> 4.883 | Weights_l2 --> 8617.828 | Lr --> 0.004 | Seconds_per_step --> 4.772 |
|
26 |
+
[2024-08-09 09:41:53,765][Main][INFO] - [train] Step 850 out of 80000 | Loss --> 6.305 | Grad_l2 --> 5.402 | Weights_l2 --> 8617.423 | Lr --> 0.004 | Seconds_per_step --> 4.902 |
|
27 |
+
[2024-08-09 09:45:52,215][Main][INFO] - [train] Step 900 out of 80000 | Loss --> 6.254 | Grad_l2 --> 5.631 | Weights_l2 --> 8617.040 | Lr --> 0.004 | Seconds_per_step --> 4.769 |
|
28 |
+
[2024-08-09 09:49:47,148][Main][INFO] - [train] Step 950 out of 80000 | Loss --> 6.232 | Grad_l2 --> 5.005 | Weights_l2 --> 8616.646 | Lr --> 0.004 | Seconds_per_step --> 4.699 |
|
29 |
+
[2024-08-09 09:53:46,382][Main][INFO] - [train] Step 1000 out of 80000 | Loss --> 6.170 | Grad_l2 --> 5.456 | Weights_l2 --> 8616.274 | Lr --> 0.004 | Seconds_per_step --> 4.785 |
|
30 |
+
[2024-08-09 09:57:42,782][Main][INFO] - [train] Step 1050 out of 80000 | Loss --> 6.163 | Grad_l2 --> 3.954 | Weights_l2 --> 8615.859 | Lr --> 0.004 | Seconds_per_step --> 4.728 |
|
31 |
+
[2024-08-09 10:01:39,784][Main][INFO] - [train] Step 1100 out of 80000 | Loss --> 6.153 | Grad_l2 --> 4.661 | Weights_l2 --> 8615.485 | Lr --> 0.004 | Seconds_per_step --> 4.740 |
|
32 |
+
[2024-08-09 10:05:37,074][Main][INFO] - [train] Step 1150 out of 80000 | Loss --> 6.120 | Grad_l2 --> 4.405 | Weights_l2 --> 8615.110 | Lr --> 0.004 | Seconds_per_step --> 4.746 |
|
33 |
+
[2024-08-09 10:09:42,375][Main][INFO] - [train] Step 1200 out of 80000 | Loss --> 6.095 | Grad_l2 --> 4.862 | Weights_l2 --> 8614.756 | Lr --> 0.004 | Seconds_per_step --> 4.906 |
|
34 |
+
[2024-08-09 10:13:44,826][Main][INFO] - [train] Step 1250 out of 80000 | Loss --> 6.065 | Grad_l2 --> 3.995 | Weights_l2 --> 8614.382 | Lr --> 0.004 | Seconds_per_step --> 4.849 |
|
35 |
+
[2024-08-09 10:17:45,169][Main][INFO] - [train] Step 1300 out of 80000 | Loss --> 5.987 | Grad_l2 --> 4.501 | Weights_l2 --> 8614.025 | Lr --> 0.005 | Seconds_per_step --> 4.807 |
|
36 |
+
[2024-08-09 10:21:46,890][Main][INFO] - [train] Step 1350 out of 80000 | Loss --> 6.011 | Grad_l2 --> 4.330 | Weights_l2 --> 8613.671 | Lr --> 0.005 | Seconds_per_step --> 4.834 |
|
37 |
+
[2024-08-09 10:25:46,445][Main][INFO] - [train] Step 1400 out of 80000 | Loss --> 5.968 | Grad_l2 --> 4.033 | Weights_l2 --> 8613.308 | Lr --> 0.005 | Seconds_per_step --> 4.791 |
|
38 |
+
[2024-08-09 10:29:35,135][Main][INFO] - [train] Step 1450 out of 80000 | Loss --> 5.965 | Grad_l2 --> 3.817 | Weights_l2 --> 8612.959 | Lr --> 0.005 | Seconds_per_step --> 4.574 |
|
39 |
+
[2024-08-09 10:33:33,627][Main][INFO] - [train] Step 1500 out of 80000 | Loss --> 5.926 | Grad_l2 --> 3.525 | Weights_l2 --> 8612.605 | Lr --> 0.005 | Seconds_per_step --> 4.770 |
|
40 |
+
[2024-08-09 10:37:31,600][Main][INFO] - [train] Step 1550 out of 80000 | Loss --> 5.908 | Grad_l2 --> 3.178 | Weights_l2 --> 8612.265 | Lr --> 0.005 | Seconds_per_step --> 4.759 |
|
41 |
+
[2024-08-09 10:41:26,179][Main][INFO] - [train] Step 1600 out of 80000 | Loss --> 5.878 | Grad_l2 --> 3.430 | Weights_l2 --> 8611.930 | Lr --> 0.005 | Seconds_per_step --> 4.692 |
|
42 |
+
[2024-08-09 10:45:17,990][Main][INFO] - [train] Step 1650 out of 80000 | Loss --> 5.864 | Grad_l2 --> 3.399 | Weights_l2 --> 8611.598 | Lr --> 0.005 | Seconds_per_step --> 4.636 |
|
43 |
+
[2024-08-09 10:49:16,915][Main][INFO] - [train] Step 1700 out of 80000 | Loss --> 5.845 | Grad_l2 --> 3.266 | Weights_l2 --> 8611.279 | Lr --> 0.005 | Seconds_per_step --> 4.778 |
|
44 |
+
[2024-08-09 10:53:22,739][Main][INFO] - [train] Step 1750 out of 80000 | Loss --> 5.815 | Grad_l2 --> 3.539 | Weights_l2 --> 8610.973 | Lr --> 0.005 | Seconds_per_step --> 4.916 |
|
45 |
+
[2024-08-09 10:57:15,819][Main][INFO] - [train] Step 1800 out of 80000 | Loss --> 5.813 | Grad_l2 --> 3.014 | Weights_l2 --> 8610.660 | Lr --> 0.005 | Seconds_per_step --> 4.662 |
|
46 |
+
[2024-08-09 11:01:07,812][Main][INFO] - [train] Step 1850 out of 80000 | Loss --> 5.781 | Grad_l2 --> 3.157 | Weights_l2 --> 8610.357 | Lr --> 0.005 | Seconds_per_step --> 4.640 |
|
47 |
+
[2024-08-09 11:05:06,130][Main][INFO] - [train] Step 1900 out of 80000 | Loss --> 5.781 | Grad_l2 --> 2.876 | Weights_l2 --> 8610.069 | Lr --> 0.005 | Seconds_per_step --> 4.766 |
|
48 |
+
[2024-08-09 11:09:10,053][Main][INFO] - [train] Step 1950 out of 80000 | Loss --> 5.727 | Grad_l2 --> 3.171 | Weights_l2 --> 8609.783 | Lr --> 0.005 | Seconds_per_step --> 4.878 |
|
49 |
+
[2024-08-09 11:13:04,823][Main][INFO] - [train] Step 2000 out of 80000 | Loss --> 5.701 | Grad_l2 --> 3.384 | Weights_l2 --> 8609.494 | Lr --> 0.005 | Seconds_per_step --> 4.695 |
|
50 |
+
[2024-08-09 11:16:58,015][Main][INFO] - [train] Step 2050 out of 80000 | Loss --> 5.706 | Grad_l2 --> 2.739 | Weights_l2 --> 8609.191 | Lr --> 0.005 | Seconds_per_step --> 4.664 |
|
51 |
+
[2024-08-09 11:21:09,220][Main][INFO] - [train] Step 2100 out of 80000 | Loss --> 5.697 | Grad_l2 --> 2.753 | Weights_l2 --> 8608.924 | Lr --> 0.005 | Seconds_per_step --> 5.024 |
|
52 |
+
[2024-08-09 11:24:59,988][Main][INFO] - [train] Step 2150 out of 80000 | Loss --> 5.679 | Grad_l2 --> 2.713 | Weights_l2 --> 8608.657 | Lr --> 0.005 | Seconds_per_step --> 4.615 |
|
53 |
+
[2024-08-09 11:28:50,211][Main][INFO] - [train] Step 2200 out of 80000 | Loss --> 5.659 | Grad_l2 --> 2.789 | Weights_l2 --> 8608.401 | Lr --> 0.005 | Seconds_per_step --> 4.604 |
|
54 |
+
[2024-08-09 11:32:47,428][Main][INFO] - [train] Step 2250 out of 80000 | Loss --> 5.643 | Grad_l2 --> 3.085 | Weights_l2 --> 8608.150 | Lr --> 0.005 | Seconds_per_step --> 4.744 |
|
55 |
+
[2024-08-09 11:36:52,444][Main][INFO] - [train] Step 2300 out of 80000 | Loss --> 5.606 | Grad_l2 --> 3.170 | Weights_l2 --> 8607.880 | Lr --> 0.005 | Seconds_per_step --> 4.900 |
|
56 |
+
[2024-08-09 11:40:40,829][Main][INFO] - [train] Step 2350 out of 80000 | Loss --> 5.585 | Grad_l2 --> 2.834 | Weights_l2 --> 8607.632 | Lr --> 0.005 | Seconds_per_step --> 4.568 |
|
57 |
+
[2024-08-09 11:44:35,220][Main][INFO] - [train] Step 2400 out of 80000 | Loss --> 5.595 | Grad_l2 --> 2.603 | Weights_l2 --> 8607.391 | Lr --> 0.005 | Seconds_per_step --> 4.688 |
|
58 |
+
[2024-08-09 11:47:52,825][Main][INFO] - [train] Step 2450 out of 80000 | Loss --> 5.571 | Grad_l2 --> 2.616 | Weights_l2 --> 8607.146 | Lr --> 0.005 | Seconds_per_step --> 3.952 |
|
59 |
+
[2024-08-09 11:50:42,712][Main][INFO] - [train] Step 2500 out of 80000 | Loss --> 5.588 | Grad_l2 --> 2.392 | Weights_l2 --> 8606.913 | Lr --> 0.005 | Seconds_per_step --> 3.398 |
|
60 |
+
[2024-08-09 11:54:19,840][Main][INFO] - [train] Step 2550 out of 80000 | Loss --> 5.598 | Grad_l2 --> 3.058 | Weights_l2 --> 8606.708 | Lr --> 0.005 | Seconds_per_step --> 4.343 |
|
61 |
+
[2024-08-09 11:58:07,896][Main][INFO] - [train] Step 2600 out of 80000 | Loss --> 5.554 | Grad_l2 --> 2.508 | Weights_l2 --> 8606.498 | Lr --> 0.005 | Seconds_per_step --> 4.561 |
|
62 |
+
[2024-08-09 12:02:07,989][Main][INFO] - [train] Step 2650 out of 80000 | Loss --> 5.536 | Grad_l2 --> 2.317 | Weights_l2 --> 8606.300 | Lr --> 0.005 | Seconds_per_step --> 4.802 |
|
63 |
+
[2024-08-09 12:06:22,355][Main][INFO] - [train] Step 2700 out of 80000 | Loss --> 5.533 | Grad_l2 --> 2.347 | Weights_l2 --> 8606.121 | Lr --> 0.005 | Seconds_per_step --> 5.087 |
|
64 |
+
[2024-08-09 12:10:05,296][Main][INFO] - [train] Step 2750 out of 80000 | Loss --> 5.502 | Grad_l2 --> 2.522 | Weights_l2 --> 8605.932 | Lr --> 0.005 | Seconds_per_step --> 4.459 |
|
65 |
+
[2024-08-09 12:13:56,942][Main][INFO] - [train] Step 2800 out of 80000 | Loss --> 5.484 | Grad_l2 --> 2.503 | Weights_l2 --> 8605.729 | Lr --> 0.005 | Seconds_per_step --> 4.633 |
|
66 |
+
[2024-08-09 12:17:56,310][Main][INFO] - [train] Step 2850 out of 80000 | Loss --> 5.471 | Grad_l2 --> 2.559 | Weights_l2 --> 8605.524 | Lr --> 0.005 | Seconds_per_step --> 4.787 |
|
67 |
+
[2024-08-09 12:21:50,249][Main][INFO] - [train] Step 2900 out of 80000 | Loss --> 5.463 | Grad_l2 --> 2.446 | Weights_l2 --> 8605.344 | Lr --> 0.005 | Seconds_per_step --> 4.679 |
|
68 |
+
[2024-08-09 12:25:43,300][Main][INFO] - [train] Step 2950 out of 80000 | Loss --> 5.481 | Grad_l2 --> 2.152 | Weights_l2 --> 8605.182 | Lr --> 0.005 | Seconds_per_step --> 4.661 |
|
69 |
+
[2024-08-09 12:29:34,779][Main][INFO] - [train] Step 3000 out of 80000 | Loss --> 5.444 | Grad_l2 --> 2.267 | Weights_l2 --> 8605.025 | Lr --> 0.005 | Seconds_per_step --> 4.630 |
|
70 |
+
[2024-08-09 12:33:43,889][Main][INFO] - [train] Step 3050 out of 80000 | Loss --> 5.445 | Grad_l2 --> 2.029 | Weights_l2 --> 8604.870 | Lr --> 0.005 | Seconds_per_step --> 4.982 |
|
71 |
+
[2024-08-09 12:37:33,552][Main][INFO] - [train] Step 3100 out of 80000 | Loss --> 5.439 | Grad_l2 --> 2.249 | Weights_l2 --> 8604.734 | Lr --> 0.005 | Seconds_per_step --> 4.593 |
|
72 |
+
[2024-08-09 12:41:33,458][Main][INFO] - [train] Step 3150 out of 80000 | Loss --> 5.390 | Grad_l2 --> 2.281 | Weights_l2 --> 8604.574 | Lr --> 0.005 | Seconds_per_step --> 4.798 |
|
73 |
+
[2024-08-09 12:45:28,169][Main][INFO] - [train] Step 3200 out of 80000 | Loss --> 5.395 | Grad_l2 --> 2.124 | Weights_l2 --> 8604.424 | Lr --> 0.005 | Seconds_per_step --> 4.694 |
|
74 |
+
[2024-08-09 12:49:31,716][Main][INFO] - [train] Step 3250 out of 80000 | Loss --> 5.381 | Grad_l2 --> 2.379 | Weights_l2 --> 8604.286 | Lr --> 0.005 | Seconds_per_step --> 4.871 |
|
75 |
+
[2024-08-09 12:53:26,686][Main][INFO] - [train] Step 3300 out of 80000 | Loss --> 5.365 | Grad_l2 --> 2.335 | Weights_l2 --> 8604.130 | Lr --> 0.005 | Seconds_per_step --> 4.699 |
|
76 |
+
[2024-08-09 12:57:18,564][Main][INFO] - [train] Step 3350 out of 80000 | Loss --> 5.365 | Grad_l2 --> 2.185 | Weights_l2 --> 8603.989 | Lr --> 0.005 | Seconds_per_step --> 4.638 |
|
77 |
+
[2024-08-09 13:01:23,837][Main][INFO] - [train] Step 3400 out of 80000 | Loss --> 5.347 | Grad_l2 --> 2.330 | Weights_l2 --> 8603.845 | Lr --> 0.005 | Seconds_per_step --> 4.905 |
|
78 |
+
[2024-08-09 13:05:16,575][Main][INFO] - [train] Step 3450 out of 80000 | Loss --> 5.349 | Grad_l2 --> 1.951 | Weights_l2 --> 8603.727 | Lr --> 0.005 | Seconds_per_step --> 4.655 |
|
79 |
+
[2024-08-09 13:08:27,542][Main][INFO] - [train] Step 3500 out of 80000 | Loss --> 5.356 | Grad_l2 --> 1.986 | Weights_l2 --> 8603.662 | Lr --> 0.005 | Seconds_per_step --> 3.819 |
|
80 |
+
[2024-08-09 13:12:30,541][Main][INFO] - [train] Step 3550 out of 80000 | Loss --> 5.312 | Grad_l2 --> 2.396 | Weights_l2 --> 8603.545 | Lr --> 0.005 | Seconds_per_step --> 4.860 |
|
81 |
+
[2024-08-09 13:16:49,213][Main][INFO] - [train] Step 3600 out of 80000 | Loss --> 5.299 | Grad_l2 --> 2.230 | Weights_l2 --> 8603.411 | Lr --> 0.005 | Seconds_per_step --> 5.173 |
|
82 |
+
[2024-08-09 13:20:53,058][Main][INFO] - [train] Step 3650 out of 80000 | Loss --> 5.307 | Grad_l2 --> 2.386 | Weights_l2 --> 8603.284 | Lr --> 0.005 | Seconds_per_step --> 4.877 |
|
83 |
+
[2024-08-09 13:24:44,487][Main][INFO] - [train] Step 3700 out of 80000 | Loss --> 5.293 | Grad_l2 --> 2.071 | Weights_l2 --> 8603.169 | Lr --> 0.005 | Seconds_per_step --> 4.629 |
|
84 |
+
[2024-08-09 13:28:47,607][Main][INFO] - [train] Step 3750 out of 80000 | Loss --> 5.298 | Grad_l2 --> 2.199 | Weights_l2 --> 8603.065 | Lr --> 0.005 | Seconds_per_step --> 4.862 |
|
85 |
+
[2024-08-09 13:32:52,512][Main][INFO] - [train] Step 3800 out of 80000 | Loss --> 5.277 | Grad_l2 --> 2.091 | Weights_l2 --> 8602.962 | Lr --> 0.006 | Seconds_per_step --> 4.898 |
|
86 |
+
[2024-08-09 13:36:42,719][Main][INFO] - [train] Step 3850 out of 80000 | Loss --> 5.284 | Grad_l2 --> 2.042 | Weights_l2 --> 8602.881 | Lr --> 0.006 | Seconds_per_step --> 4.604 |
|
87 |
+
[2024-08-09 13:40:34,318][Main][INFO] - [train] Step 3900 out of 80000 | Loss --> 5.245 | Grad_l2 --> 2.240 | Weights_l2 --> 8602.781 | Lr --> 0.006 | Seconds_per_step --> 4.632 |
|
88 |
+
[2024-08-09 13:44:45,754][Main][INFO] - [train] Step 3950 out of 80000 | Loss --> 5.245 | Grad_l2 --> 1.955 | Weights_l2 --> 8602.686 | Lr --> 0.006 | Seconds_per_step --> 5.029 |
|
89 |
+
[2024-08-09 13:48:39,099][Main][INFO] - [train] Step 4000 out of 80000 | Loss --> 5.257 | Grad_l2 --> 2.011 | Weights_l2 --> 8602.644 | Lr --> 0.006 | Seconds_per_step --> 4.667 |
|
90 |
+
[2024-08-09 13:52:31,353][Main][INFO] - [train] Step 4050 out of 80000 | Loss --> 5.239 | Grad_l2 --> 1.838 | Weights_l2 --> 8602.573 | Lr --> 0.006 | Seconds_per_step --> 4.645 |
|
91 |
+
[2024-08-09 13:56:29,186][Main][INFO] - [train] Step 4100 out of 80000 | Loss --> 5.238 | Grad_l2 --> 1.935 | Weights_l2 --> 8602.540 | Lr --> 0.006 | Seconds_per_step --> 4.757 |
|
92 |
+
[2024-08-09 14:00:27,682][Main][INFO] - [train] Step 4150 out of 80000 | Loss --> 5.211 | Grad_l2 --> 2.014 | Weights_l2 --> 8602.468 | Lr --> 0.006 | Seconds_per_step --> 4.770 |
|
93 |
+
[2024-08-09 14:04:26,879][Main][INFO] - [train] Step 4200 out of 80000 | Loss --> 5.202 | Grad_l2 --> 2.106 | Weights_l2 --> 8602.418 | Lr --> 0.006 | Seconds_per_step --> 4.784 |
|
94 |
+
[2024-08-09 14:08:26,097][Main][INFO] - [train] Step 4250 out of 80000 | Loss --> 5.194 | Grad_l2 --> 1.876 | Weights_l2 --> 8602.330 | Lr --> 0.006 | Seconds_per_step --> 4.784 |
|
95 |
+
[2024-08-09 14:12:43,883][Main][INFO] - [train] Step 4300 out of 80000 | Loss --> 5.216 | Grad_l2 --> 1.692 | Weights_l2 --> 8602.339 | Lr --> 0.006 | Seconds_per_step --> 5.156 |
|
96 |
+
[2024-08-09 14:16:59,892][Main][INFO] - [train] Step 4350 out of 80000 | Loss --> 5.195 | Grad_l2 --> 1.824 | Weights_l2 --> 8602.342 | Lr --> 0.006 | Seconds_per_step --> 5.120 |
|
97 |
+
[2024-08-09 14:20:57,072][Main][INFO] - [train] Step 4400 out of 80000 | Loss --> 5.193 | Grad_l2 --> 1.640 | Weights_l2 --> 8602.351 | Lr --> 0.006 | Seconds_per_step --> 4.744 |
|
98 |
+
[2024-08-09 14:25:01,683][Main][INFO] - [train] Step 4450 out of 80000 | Loss --> 5.186 | Grad_l2 --> 1.790 | Weights_l2 --> 8602.369 | Lr --> 0.006 | Seconds_per_step --> 4.892 |
|
99 |
+
[2024-08-09 14:29:08,638][Main][INFO] - [train] Step 4500 out of 80000 | Loss --> 5.162 | Grad_l2 --> 1.890 | Weights_l2 --> 8602.364 | Lr --> 0.006 | Seconds_per_step --> 4.939 |
|
100 |
+
[2024-08-09 14:32:58,390][Main][INFO] - [train] Step 4550 out of 80000 | Loss --> 5.136 | Grad_l2 --> 1.776 | Weights_l2 --> 8602.345 | Lr --> 0.006 | Seconds_per_step --> 4.595 |
|
101 |
+
[2024-08-09 14:37:00,248][Main][INFO] - [train] Step 4600 out of 80000 | Loss --> 5.135 | Grad_l2 --> 1.661 | Weights_l2 --> 8602.366 | Lr --> 0.006 | Seconds_per_step --> 4.837 |
|
102 |
+
[2024-08-09 14:41:11,560][Main][INFO] - [train] Step 4650 out of 80000 | Loss --> 5.139 | Grad_l2 --> 1.623 | Weights_l2 --> 8602.434 | Lr --> 0.006 | Seconds_per_step --> 5.026 |
|
103 |
+
[2024-08-09 14:45:14,951][Main][INFO] - [train] Step 4700 out of 80000 | Loss --> 5.090 | Grad_l2 --> 1.703 | Weights_l2 --> 8602.491 | Lr --> 0.006 | Seconds_per_step --> 4.868 |
|
104 |
+
[2024-08-09 14:49:09,655][Main][INFO] - [train] Step 4750 out of 80000 | Loss --> 5.056 | Grad_l2 --> 1.918 | Weights_l2 --> 8602.542 | Lr --> 0.006 | Seconds_per_step --> 4.694 |
|
105 |
+
[2024-08-09 14:53:11,228][Main][INFO] - [train] Step 4800 out of 80000 | Loss --> 5.018 | Grad_l2 --> 1.805 | Weights_l2 --> 8602.552 | Lr --> 0.006 | Seconds_per_step --> 4.831 |
|
106 |
+
[2024-08-09 14:57:15,004][Main][INFO] - [train] Step 4850 out of 80000 | Loss --> 5.016 | Grad_l2 --> 1.660 | Weights_l2 --> 8602.639 | Lr --> 0.006 | Seconds_per_step --> 4.876 |
|
107 |
+
[2024-08-09 15:01:09,698][Main][INFO] - [train] Step 4900 out of 80000 | Loss --> 4.994 | Grad_l2 --> 1.595 | Weights_l2 --> 8602.806 | Lr --> 0.006 | Seconds_per_step --> 4.694 |
|
108 |
+
[2024-08-09 15:04:01,695][Main][INFO] - [train] Step 4950 out of 80000 | Loss --> 4.946 | Grad_l2 --> 1.783 | Weights_l2 --> 8602.949 | Lr --> 0.006 | Seconds_per_step --> 3.440 |
|
109 |
+
[2024-08-09 15:07:39,946][Main][INFO] - [train] Step 5000 out of 80000 | Loss --> 4.722 | Grad_l2 --> 1.590 | Weights_l2 --> 8603.165 | Lr --> 0.006 | Seconds_per_step --> 4.365 |
|
110 |
+
[2024-08-09 15:07:39,947][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-5000
|
111 |
+
[2024-08-09 15:07:39,951][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
112 |
+
[2024-08-09 15:07:46,022][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-5000/model.safetensors
|
113 |
+
[2024-08-09 15:07:49,438][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-5000/optimizer.bin
|
114 |
+
[2024-08-09 15:07:49,439][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-5000/scheduler.bin
|
115 |
+
[2024-08-09 15:07:49,439][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-5000/sampler.bin
|
116 |
+
[2024-08-09 15:07:49,439][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-5000/sampler_1.bin
|
117 |
+
[2024-08-09 15:07:49,440][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-5000/random_states_0.pkl
|
118 |
+
[2024-08-09 15:11:55,741][Main][INFO] - [train] Step 5050 out of 80000 | Loss --> 4.582 | Grad_l2 --> 1.679 | Weights_l2 --> 8603.473 | Lr --> 0.006 | Seconds_per_step --> 5.116 |
|
119 |
+
[2024-08-09 15:15:46,314][Main][INFO] - [train] Step 5100 out of 80000 | Loss --> 4.472 | Grad_l2 --> 1.636 | Weights_l2 --> 8603.746 | Lr --> 0.006 | Seconds_per_step --> 4.611 |
|
120 |
+
[2024-08-09 15:19:45,374][Main][INFO] - [train] Step 5150 out of 80000 | Loss --> 4.370 | Grad_l2 --> 1.523 | Weights_l2 --> 8604.092 | Lr --> 0.006 | Seconds_per_step --> 4.781 |
|
121 |
+
[2024-08-09 15:23:51,223][Main][INFO] - [train] Step 5200 out of 80000 | Loss --> 4.267 | Grad_l2 --> 1.542 | Weights_l2 --> 8604.440 | Lr --> 0.006 | Seconds_per_step --> 4.917 |
|
122 |
+
[2024-08-09 15:27:51,655][Main][INFO] - [train] Step 5250 out of 80000 | Loss --> 4.191 | Grad_l2 --> 1.477 | Weights_l2 --> 8604.872 | Lr --> 0.006 | Seconds_per_step --> 4.809 |
|
123 |
+
[2024-08-09 15:31:44,251][Main][INFO] - [train] Step 5300 out of 80000 | Loss --> 4.128 | Grad_l2 --> 1.490 | Weights_l2 --> 8605.306 | Lr --> 0.006 | Seconds_per_step --> 4.652 |
|
124 |
+
[2024-08-09 15:35:40,470][Main][INFO] - [train] Step 5350 out of 80000 | Loss --> 4.067 | Grad_l2 --> 1.397 | Weights_l2 --> 8605.776 | Lr --> 0.006 | Seconds_per_step --> 4.724 |
|
125 |
+
[2024-08-09 15:39:48,973][Main][INFO] - [train] Step 5400 out of 80000 | Loss --> 4.015 | Grad_l2 --> 1.239 | Weights_l2 --> 8606.428 | Lr --> 0.006 | Seconds_per_step --> 4.970 |
|
126 |
+
[2024-08-09 15:43:39,070][Main][INFO] - [train] Step 5450 out of 80000 | Loss --> 3.968 | Grad_l2 --> 1.219 | Weights_l2 --> 8607.147 | Lr --> 0.006 | Seconds_per_step --> 4.602 |
|
127 |
+
[2024-08-09 15:47:34,049][Main][INFO] - [train] Step 5500 out of 80000 | Loss --> 3.903 | Grad_l2 --> 1.203 | Weights_l2 --> 8607.924 | Lr --> 0.006 | Seconds_per_step --> 4.700 |
|
128 |
+
[2024-08-09 15:51:38,499][Main][INFO] - [train] Step 5550 out of 80000 | Loss --> 3.855 | Grad_l2 --> 1.167 | Weights_l2 --> 8608.720 | Lr --> 0.006 | Seconds_per_step --> 4.889 |
|
129 |
+
[2024-08-09 15:55:46,120][Main][INFO] - [train] Step 5600 out of 80000 | Loss --> 3.815 | Grad_l2 --> 1.111 | Weights_l2 --> 8609.615 | Lr --> 0.006 | Seconds_per_step --> 4.952 |
|
130 |
+
[2024-08-09 15:59:40,828][Main][INFO] - [train] Step 5650 out of 80000 | Loss --> 3.768 | Grad_l2 --> 1.066 | Weights_l2 --> 8610.530 | Lr --> 0.006 | Seconds_per_step --> 4.694 |
|
131 |
+
[2024-08-09 16:03:38,938][Main][INFO] - [train] Step 5700 out of 80000 | Loss --> 3.711 | Grad_l2 --> 1.048 | Weights_l2 --> 8611.436 | Lr --> 0.006 | Seconds_per_step --> 4.762 |
|
132 |
+
[2024-08-09 16:07:49,871][Main][INFO] - [train] Step 5750 out of 80000 | Loss --> 3.675 | Grad_l2 --> 0.998 | Weights_l2 --> 8612.404 | Lr --> 0.006 | Seconds_per_step --> 5.019 |
|
133 |
+
[2024-08-09 16:11:53,420][Main][INFO] - [train] Step 5800 out of 80000 | Loss --> 3.625 | Grad_l2 --> 0.993 | Weights_l2 --> 8613.329 | Lr --> 0.006 | Seconds_per_step --> 4.871 |
|
134 |
+
[2024-08-09 16:15:50,534][Main][INFO] - [train] Step 5850 out of 80000 | Loss --> 3.580 | Grad_l2 --> 0.952 | Weights_l2 --> 8614.289 | Lr --> 0.006 | Seconds_per_step --> 4.742 |
|
135 |
+
[2024-08-09 16:19:45,983][Main][INFO] - [train] Step 5900 out of 80000 | Loss --> 3.545 | Grad_l2 --> 1.014 | Weights_l2 --> 8615.197 | Lr --> 0.006 | Seconds_per_step --> 4.709 |
|
136 |
+
[2024-08-09 16:23:51,342][Main][INFO] - [train] Step 5950 out of 80000 | Loss --> 3.522 | Grad_l2 --> 0.927 | Weights_l2 --> 8616.137 | Lr --> 0.006 | Seconds_per_step --> 4.907 |
|
137 |
+
[2024-08-09 16:27:42,121][Main][INFO] - [train] Step 6000 out of 80000 | Loss --> 3.483 | Grad_l2 --> 0.926 | Weights_l2 --> 8617.066 | Lr --> 0.006 | Seconds_per_step --> 4.616 |
|
138 |
+
[2024-08-09 16:31:41,278][Main][INFO] - [train] Step 6050 out of 80000 | Loss --> 3.455 | Grad_l2 --> 0.886 | Weights_l2 --> 8617.977 | Lr --> 0.006 | Seconds_per_step --> 4.783 |
|
139 |
+
[2024-08-09 16:35:47,786][Main][INFO] - [train] Step 6100 out of 80000 | Loss --> 3.428 | Grad_l2 --> 0.956 | Weights_l2 --> 8618.840 | Lr --> 0.006 | Seconds_per_step --> 4.930 |
|
140 |
+
[2024-08-09 16:39:45,096][Main][INFO] - [train] Step 6150 out of 80000 | Loss --> 3.399 | Grad_l2 --> 0.832 | Weights_l2 --> 8619.684 | Lr --> 0.006 | Seconds_per_step --> 4.746 |
|
141 |
+
[2024-08-09 16:43:41,554][Main][INFO] - [train] Step 6200 out of 80000 | Loss --> 3.377 | Grad_l2 --> 0.868 | Weights_l2 --> 8620.530 | Lr --> 0.006 | Seconds_per_step --> 4.729 |
|
142 |
+
[2024-08-09 16:47:45,442][Main][INFO] - [train] Step 6250 out of 80000 | Loss --> 3.363 | Grad_l2 --> 0.850 | Weights_l2 --> 8621.325 | Lr --> 0.006 | Seconds_per_step --> 4.878 |
|
143 |
+
[2024-08-09 16:51:50,312][Main][INFO] - [train] Step 6300 out of 80000 | Loss --> 3.332 | Grad_l2 --> 0.840 | Weights_l2 --> 8622.117 | Lr --> 0.007 | Seconds_per_step --> 4.897 |
|
144 |
+
[2024-08-09 16:55:47,619][Main][INFO] - [train] Step 6350 out of 80000 | Loss --> 3.311 | Grad_l2 --> 0.875 | Weights_l2 --> 8622.932 | Lr --> 0.007 | Seconds_per_step --> 4.746 |
|
145 |
+
[2024-08-09 16:59:44,744][Main][INFO] - [train] Step 6400 out of 80000 | Loss --> 3.289 | Grad_l2 --> 0.808 | Weights_l2 --> 8623.729 | Lr --> 0.007 | Seconds_per_step --> 4.742 |
|
146 |
+
[2024-08-09 17:03:47,092][Main][INFO] - [train] Step 6450 out of 80000 | Loss --> 3.279 | Grad_l2 --> 0.782 | Weights_l2 --> 8624.498 | Lr --> 0.007 | Seconds_per_step --> 4.847 |
|
147 |
+
[2024-08-09 17:07:51,580][Main][INFO] - [train] Step 6500 out of 80000 | Loss --> 3.250 | Grad_l2 --> 0.812 | Weights_l2 --> 8625.266 | Lr --> 0.007 | Seconds_per_step --> 4.890 |
|
148 |
+
[2024-08-09 17:11:44,444][Main][INFO] - [train] Step 6550 out of 80000 | Loss --> 3.248 | Grad_l2 --> 0.806 | Weights_l2 --> 8626.024 | Lr --> 0.007 | Seconds_per_step --> 4.657 |
|
149 |
+
[2024-08-09 17:15:43,498][Main][INFO] - [train] Step 6600 out of 80000 | Loss --> 3.216 | Grad_l2 --> 0.765 | Weights_l2 --> 8626.794 | Lr --> 0.007 | Seconds_per_step --> 4.781 |
|
150 |
+
[2024-08-09 17:19:50,311][Main][INFO] - [train] Step 6650 out of 80000 | Loss --> 3.209 | Grad_l2 --> 0.793 | Weights_l2 --> 8627.521 | Lr --> 0.007 | Seconds_per_step --> 4.936 |
|
151 |
+
[2024-08-09 17:23:54,093][Main][INFO] - [train] Step 6700 out of 80000 | Loss --> 3.200 | Grad_l2 --> 0.788 | Weights_l2 --> 8628.294 | Lr --> 0.007 | Seconds_per_step --> 4.876 |
|
152 |
+
[2024-08-09 17:27:47,402][Main][INFO] - [train] Step 6750 out of 80000 | Loss --> 3.176 | Grad_l2 --> 0.762 | Weights_l2 --> 8629.053 | Lr --> 0.007 | Seconds_per_step --> 4.666 |
|
153 |
+
[2024-08-09 17:31:49,523][Main][INFO] - [train] Step 6800 out of 80000 | Loss --> 3.170 | Grad_l2 --> 0.778 | Weights_l2 --> 8629.825 | Lr --> 0.007 | Seconds_per_step --> 4.842 |
|
154 |
+
[2024-08-09 17:35:52,826][Main][INFO] - [train] Step 6850 out of 80000 | Loss --> 3.159 | Grad_l2 --> 0.775 | Weights_l2 --> 8630.568 | Lr --> 0.007 | Seconds_per_step --> 4.866 |
|
155 |
+
[2024-08-09 17:39:46,125][Main][INFO] - [train] Step 6900 out of 80000 | Loss --> 3.158 | Grad_l2 --> 0.757 | Weights_l2 --> 8631.325 | Lr --> 0.007 | Seconds_per_step --> 4.666 |
|
156 |
+
[2024-08-09 17:43:39,817][Main][INFO] - [train] Step 6950 out of 80000 | Loss --> 3.138 | Grad_l2 --> 0.766 | Weights_l2 --> 8632.055 | Lr --> 0.007 | Seconds_per_step --> 4.674 |
|
157 |
+
[2024-08-09 17:47:44,929][Main][INFO] - [train] Step 7000 out of 80000 | Loss --> 3.123 | Grad_l2 --> 0.759 | Weights_l2 --> 8632.805 | Lr --> 0.007 | Seconds_per_step --> 4.902 |
|
158 |
+
[2024-08-09 17:51:43,866][Main][INFO] - [train] Step 7050 out of 80000 | Loss --> 3.118 | Grad_l2 --> 0.752 | Weights_l2 --> 8633.540 | Lr --> 0.007 | Seconds_per_step --> 4.779 |
|
159 |
+
[2024-08-09 17:55:42,820][Main][INFO] - [train] Step 7100 out of 80000 | Loss --> 3.103 | Grad_l2 --> 0.757 | Weights_l2 --> 8634.285 | Lr --> 0.007 | Seconds_per_step --> 4.779 |
|
160 |
+
[2024-08-09 17:59:44,322][Main][INFO] - [train] Step 7150 out of 80000 | Loss --> 3.083 | Grad_l2 --> 0.755 | Weights_l2 --> 8635.030 | Lr --> 0.007 | Seconds_per_step --> 4.830 |
|
161 |
+
[2024-08-09 18:03:44,919][Main][INFO] - [train] Step 7200 out of 80000 | Loss --> 3.073 | Grad_l2 --> 0.735 | Weights_l2 --> 8635.760 | Lr --> 0.007 | Seconds_per_step --> 4.812 |
|
162 |
+
[2024-08-09 18:07:37,774][Main][INFO] - [train] Step 7250 out of 80000 | Loss --> 3.055 | Grad_l2 --> 0.718 | Weights_l2 --> 8636.493 | Lr --> 0.007 | Seconds_per_step --> 4.657 |
|
163 |
+
[2024-08-09 18:11:34,198][Main][INFO] - [train] Step 7300 out of 80000 | Loss --> 3.051 | Grad_l2 --> 0.721 | Weights_l2 --> 8637.245 | Lr --> 0.007 | Seconds_per_step --> 4.728 |
|
164 |
+
[2024-08-09 18:15:38,927][Main][INFO] - [train] Step 7350 out of 80000 | Loss --> 3.041 | Grad_l2 --> 0.762 | Weights_l2 --> 8637.991 | Lr --> 0.007 | Seconds_per_step --> 4.895 |
|
165 |
+
[2024-08-09 18:19:42,181][Main][INFO] - [train] Step 7400 out of 80000 | Loss --> 3.031 | Grad_l2 --> 0.720 | Weights_l2 --> 8638.728 | Lr --> 0.007 | Seconds_per_step --> 4.865 |
|
166 |
+
[2024-08-09 18:23:37,911][Main][INFO] - [train] Step 7450 out of 80000 | Loss --> 3.033 | Grad_l2 --> 0.718 | Weights_l2 --> 8639.471 | Lr --> 0.007 | Seconds_per_step --> 4.715 |
|
167 |
+
[2024-08-09 18:27:38,146][Main][INFO] - [train] Step 7500 out of 80000 | Loss --> 3.020 | Grad_l2 --> 0.729 | Weights_l2 --> 8640.206 | Lr --> 0.007 | Seconds_per_step --> 4.805 |
|
168 |
+
[2024-08-09 18:31:39,590][Main][INFO] - [train] Step 7550 out of 80000 | Loss --> 3.004 | Grad_l2 --> 0.734 | Weights_l2 --> 8640.967 | Lr --> 0.007 | Seconds_per_step --> 4.829 |
|
169 |
+
[2024-08-09 18:35:32,805][Main][INFO] - [train] Step 7600 out of 80000 | Loss --> 2.986 | Grad_l2 --> 0.714 | Weights_l2 --> 8641.711 | Lr --> 0.007 | Seconds_per_step --> 4.664 |
|
170 |
+
[2024-08-09 18:39:28,080][Main][INFO] - [train] Step 7650 out of 80000 | Loss --> 2.994 | Grad_l2 --> 0.743 | Weights_l2 --> 8642.483 | Lr --> 0.007 | Seconds_per_step --> 4.705 |
|
171 |
+
[2024-08-09 18:43:37,815][Main][INFO] - [train] Step 7700 out of 80000 | Loss --> 2.980 | Grad_l2 --> 0.699 | Weights_l2 --> 8643.242 | Lr --> 0.007 | Seconds_per_step --> 4.995 |
|
172 |
+
[2024-08-09 18:47:42,799][Main][INFO] - [train] Step 7750 out of 80000 | Loss --> 2.976 | Grad_l2 --> 0.725 | Weights_l2 --> 8643.993 | Lr --> 0.007 | Seconds_per_step --> 4.900 |
|
173 |
+
[2024-08-09 18:51:34,464][Main][INFO] - [train] Step 7800 out of 80000 | Loss --> 2.963 | Grad_l2 --> 0.699 | Weights_l2 --> 8644.781 | Lr --> 0.007 | Seconds_per_step --> 4.633 |
|
174 |
+
[2024-08-09 18:55:32,534][Main][INFO] - [train] Step 7850 out of 80000 | Loss --> 2.954 | Grad_l2 --> 0.706 | Weights_l2 --> 8645.547 | Lr --> 0.007 | Seconds_per_step --> 4.761 |
|
175 |
+
[2024-08-09 18:59:39,507][Main][INFO] - [train] Step 7900 out of 80000 | Loss --> 2.947 | Grad_l2 --> 0.689 | Weights_l2 --> 8646.333 | Lr --> 0.007 | Seconds_per_step --> 4.939 |
|
176 |
+
[2024-08-09 19:03:32,747][Main][INFO] - [train] Step 7950 out of 80000 | Loss --> 2.935 | Grad_l2 --> 0.701 | Weights_l2 --> 8647.099 | Lr --> 0.007 | Seconds_per_step --> 4.665 |
|
177 |
+
[2024-08-09 19:07:42,994][Main][INFO] - [train] Step 8000 out of 80000 | Loss --> 2.940 | Grad_l2 --> 0.709 | Weights_l2 --> 8647.889 | Lr --> 0.007 | Seconds_per_step --> 5.005 |
|
178 |
+
[2024-08-09 19:11:49,930][Main][INFO] - [train] Step 8050 out of 80000 | Loss --> 2.919 | Grad_l2 --> 0.699 | Weights_l2 --> 8648.663 | Lr --> 0.007 | Seconds_per_step --> 4.939 |
|
179 |
+
[2024-08-09 19:16:03,022][Main][INFO] - [train] Step 8100 out of 80000 | Loss --> 2.916 | Grad_l2 --> 0.690 | Weights_l2 --> 8649.453 | Lr --> 0.007 | Seconds_per_step --> 5.062 |
|
180 |
+
[2024-08-09 19:20:05,203][Main][INFO] - [train] Step 8150 out of 80000 | Loss --> 2.914 | Grad_l2 --> 0.712 | Weights_l2 --> 8650.238 | Lr --> 0.007 | Seconds_per_step --> 4.844 |
|
181 |
+
[2024-08-09 19:23:57,007][Main][INFO] - [train] Step 8200 out of 80000 | Loss --> 2.903 | Grad_l2 --> 0.727 | Weights_l2 --> 8651.038 | Lr --> 0.007 | Seconds_per_step --> 4.636 |
|
182 |
+
[2024-08-09 19:28:02,052][Main][INFO] - [train] Step 8250 out of 80000 | Loss --> 2.896 | Grad_l2 --> 0.691 | Weights_l2 --> 8651.842 | Lr --> 0.007 | Seconds_per_step --> 4.901 |
|
183 |
+
[2024-08-09 19:32:01,708][Main][INFO] - [train] Step 8300 out of 80000 | Loss --> 2.889 | Grad_l2 --> 0.703 | Weights_l2 --> 8652.661 | Lr --> 0.007 | Seconds_per_step --> 4.793 |
|
184 |
+
[2024-08-09 19:35:54,542][Main][INFO] - [train] Step 8350 out of 80000 | Loss --> 2.882 | Grad_l2 --> 0.672 | Weights_l2 --> 8653.459 | Lr --> 0.007 | Seconds_per_step --> 4.657 |
|
185 |
+
[2024-08-09 19:39:53,565][Main][INFO] - [train] Step 8400 out of 80000 | Loss --> 2.861 | Grad_l2 --> 0.676 | Weights_l2 --> 8654.299 | Lr --> 0.007 | Seconds_per_step --> 4.780 |
|
186 |
+
[2024-08-09 19:43:54,929][Main][INFO] - [train] Step 8450 out of 80000 | Loss --> 2.870 | Grad_l2 --> 0.680 | Weights_l2 --> 8655.106 | Lr --> 0.007 | Seconds_per_step --> 4.827 |
|
187 |
+
[2024-08-09 19:47:46,390][Main][INFO] - [train] Step 8500 out of 80000 | Loss --> 2.857 | Grad_l2 --> 0.673 | Weights_l2 --> 8655.929 | Lr --> 0.007 | Seconds_per_step --> 4.629 |
|
188 |
+
[2024-08-09 19:51:41,774][Main][INFO] - [train] Step 8550 out of 80000 | Loss --> 2.847 | Grad_l2 --> 0.674 | Weights_l2 --> 8656.760 | Lr --> 0.007 | Seconds_per_step --> 4.708 |
|
189 |
+
[2024-08-09 19:55:50,508][Main][INFO] - [train] Step 8600 out of 80000 | Loss --> 2.838 | Grad_l2 --> 0.679 | Weights_l2 --> 8657.613 | Lr --> 0.007 | Seconds_per_step --> 4.975 |
|
190 |
+
[2024-08-09 19:59:55,899][Main][INFO] - [train] Step 8650 out of 80000 | Loss --> 2.847 | Grad_l2 --> 0.668 | Weights_l2 --> 8658.480 | Lr --> 0.007 | Seconds_per_step --> 4.908 |
|
191 |
+
[2024-08-09 20:03:46,940][Main][INFO] - [train] Step 8700 out of 80000 | Loss --> 2.834 | Grad_l2 --> 0.689 | Weights_l2 --> 8659.322 | Lr --> 0.007 | Seconds_per_step --> 4.621 |
|
192 |
+
[2024-08-09 20:07:40,599][Main][INFO] - [train] Step 8750 out of 80000 | Loss --> 2.814 | Grad_l2 --> 0.665 | Weights_l2 --> 8660.208 | Lr --> 0.007 | Seconds_per_step --> 4.673 |
|
193 |
+
[2024-08-09 20:11:45,521][Main][INFO] - [train] Step 8800 out of 80000 | Loss --> 2.817 | Grad_l2 --> 0.645 | Weights_l2 --> 8661.057 | Lr --> 0.008 | Seconds_per_step --> 4.898 |
|
194 |
+
[2024-08-09 20:15:34,178][Main][INFO] - [train] Step 8850 out of 80000 | Loss --> 2.807 | Grad_l2 --> 0.662 | Weights_l2 --> 8661.931 | Lr --> 0.008 | Seconds_per_step --> 4.573 |
|
195 |
+
[2024-08-09 20:19:09,957][Main][INFO] - [train] Step 8900 out of 80000 | Loss --> 2.806 | Grad_l2 --> 0.671 | Weights_l2 --> 8662.810 | Lr --> 0.008 | Seconds_per_step --> 4.316 |
|
196 |
+
[2024-08-09 20:22:37,497][Main][INFO] - [train] Step 8950 out of 80000 | Loss --> 2.799 | Grad_l2 --> 0.656 | Weights_l2 --> 8663.699 | Lr --> 0.008 | Seconds_per_step --> 4.151 |
|
197 |
+
[2024-08-09 20:26:01,302][Main][INFO] - [train] Step 9000 out of 80000 | Loss --> 2.796 | Grad_l2 --> 0.657 | Weights_l2 --> 8664.591 | Lr --> 0.008 | Seconds_per_step --> 4.076 |
|
198 |
+
[2024-08-09 20:29:28,057][Main][INFO] - [train] Step 9050 out of 80000 | Loss --> 2.787 | Grad_l2 --> 0.650 | Weights_l2 --> 8665.480 | Lr --> 0.008 | Seconds_per_step --> 4.135 |
|
199 |
+
[2024-08-09 20:32:55,736][Main][INFO] - [train] Step 9100 out of 80000 | Loss --> 2.771 | Grad_l2 --> 0.668 | Weights_l2 --> 8666.372 | Lr --> 0.008 | Seconds_per_step --> 4.154 |
|
200 |
+
[2024-08-09 20:36:26,470][Main][INFO] - [train] Step 9150 out of 80000 | Loss --> 2.762 | Grad_l2 --> 0.630 | Weights_l2 --> 8667.256 | Lr --> 0.008 | Seconds_per_step --> 4.215 |
|
201 |
+
[2024-08-09 20:40:02,302][Main][INFO] - [train] Step 9200 out of 80000 | Loss --> 2.764 | Grad_l2 --> 0.668 | Weights_l2 --> 8668.181 | Lr --> 0.008 | Seconds_per_step --> 4.317 |
|
202 |
+
[2024-08-09 20:43:38,319][Main][INFO] - [train] Step 9250 out of 80000 | Loss --> 2.760 | Grad_l2 --> 0.658 | Weights_l2 --> 8669.118 | Lr --> 0.008 | Seconds_per_step --> 4.320 |
|
203 |
+
[2024-08-09 20:47:12,593][Main][INFO] - [train] Step 9300 out of 80000 | Loss --> 2.754 | Grad_l2 --> 0.631 | Weights_l2 --> 8670.046 | Lr --> 0.008 | Seconds_per_step --> 4.285 |
|
204 |
+
[2024-08-09 20:50:50,547][Main][INFO] - [train] Step 9350 out of 80000 | Loss --> 2.748 | Grad_l2 --> 0.659 | Weights_l2 --> 8670.961 | Lr --> 0.008 | Seconds_per_step --> 4.359 |
|
205 |
+
[2024-08-09 20:54:27,164][Main][INFO] - [train] Step 9400 out of 80000 | Loss --> 2.745 | Grad_l2 --> 0.645 | Weights_l2 --> 8671.908 | Lr --> 0.008 | Seconds_per_step --> 4.332 |
|
206 |
+
[2024-08-09 20:57:57,318][Main][INFO] - [train] Step 9450 out of 80000 | Loss --> 2.734 | Grad_l2 --> 0.651 | Weights_l2 --> 8672.837 | Lr --> 0.008 | Seconds_per_step --> 4.203 |
|
207 |
+
[2024-08-09 21:01:27,114][Main][INFO] - [train] Step 9500 out of 80000 | Loss --> 2.724 | Grad_l2 --> 0.651 | Weights_l2 --> 8673.783 | Lr --> 0.008 | Seconds_per_step --> 4.196 |
|
208 |
+
[2024-08-09 21:05:01,540][Main][INFO] - [train] Step 9550 out of 80000 | Loss --> 2.723 | Grad_l2 --> 0.635 | Weights_l2 --> 8674.757 | Lr --> 0.008 | Seconds_per_step --> 4.289 |
|
209 |
+
[2024-08-09 21:08:31,178][Main][INFO] - [train] Step 9600 out of 80000 | Loss --> 2.707 | Grad_l2 --> 0.633 | Weights_l2 --> 8675.741 | Lr --> 0.008 | Seconds_per_step --> 4.193 |
|
210 |
+
[2024-08-09 21:12:04,549][Main][INFO] - [train] Step 9650 out of 80000 | Loss --> 2.705 | Grad_l2 --> 0.662 | Weights_l2 --> 8676.698 | Lr --> 0.008 | Seconds_per_step --> 4.267 |
|
211 |
+
[2024-08-09 21:15:31,359][Main][INFO] - [train] Step 9700 out of 80000 | Loss --> 2.701 | Grad_l2 --> 0.620 | Weights_l2 --> 8677.665 | Lr --> 0.008 | Seconds_per_step --> 4.136 |
|
212 |
+
[2024-08-09 21:19:05,681][Main][INFO] - [train] Step 9750 out of 80000 | Loss --> 2.696 | Grad_l2 --> 0.635 | Weights_l2 --> 8678.669 | Lr --> 0.008 | Seconds_per_step --> 4.286 |
|
213 |
+
[2024-08-09 21:22:39,126][Main][INFO] - [train] Step 9800 out of 80000 | Loss --> 2.698 | Grad_l2 --> 0.652 | Weights_l2 --> 8679.660 | Lr --> 0.008 | Seconds_per_step --> 4.269 |
|
214 |
+
[2024-08-09 21:26:12,926][Main][INFO] - [train] Step 9850 out of 80000 | Loss --> 2.691 | Grad_l2 --> 0.629 | Weights_l2 --> 8680.657 | Lr --> 0.008 | Seconds_per_step --> 4.276 |
|
215 |
+
[2024-08-09 21:29:43,650][Main][INFO] - [train] Step 9900 out of 80000 | Loss --> 2.683 | Grad_l2 --> 0.639 | Weights_l2 --> 8681.671 | Lr --> 0.008 | Seconds_per_step --> 4.214 |
|
216 |
+
[2024-08-09 21:33:15,612][Main][INFO] - [train] Step 9950 out of 80000 | Loss --> 2.678 | Grad_l2 --> 0.624 | Weights_l2 --> 8682.710 | Lr --> 0.008 | Seconds_per_step --> 4.239 |
|
217 |
+
[2024-08-09 21:36:48,784][Main][INFO] - [train] Step 10000 out of 80000 | Loss --> 2.683 | Grad_l2 --> 0.631 | Weights_l2 --> 8683.746 | Lr --> 0.008 | Seconds_per_step --> 4.263 |
|
218 |
+
[2024-08-09 21:36:48,785][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
|
219 |
+
[2024-08-09 21:36:48,789][accelerate.utils.other][WARNING] - Removed shared tensor {'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
|
220 |
+
[2024-08-09 21:36:50,921][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
|
221 |
+
[2024-08-09 21:36:54,146][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
|
222 |
+
[2024-08-09 21:36:54,146][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
|
223 |
+
[2024-08-09 21:36:54,146][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
|
224 |
+
[2024-08-09 21:36:54,147][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
|
225 |
+
[2024-08-09 21:36:54,147][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
|
226 |
+
[2024-08-09 21:40:24,314][Main][INFO] - [train] Step 10050 out of 80000 | Loss --> 2.672 | Grad_l2 --> 0.620 | Weights_l2 --> 8684.763 | Lr --> 0.008 | Seconds_per_step --> 4.311 |
|
227 |
+
[2024-08-09 21:43:54,934][Main][INFO] - [train] Step 10100 out of 80000 | Loss --> 2.668 | Grad_l2 --> 0.630 | Weights_l2 --> 8685.788 | Lr --> 0.008 | Seconds_per_step --> 4.212 |
|
228 |
+
[2024-08-09 21:47:26,893][Main][INFO] - [train] Step 10150 out of 80000 | Loss --> 2.664 | Grad_l2 --> 0.622 | Weights_l2 --> 8686.819 | Lr --> 0.008 | Seconds_per_step --> 4.239 |
|
229 |
+
[2024-08-09 21:50:55,047][Main][INFO] - [train] Step 10200 out of 80000 | Loss --> 2.647 | Grad_l2 --> 0.609 | Weights_l2 --> 8687.859 | Lr --> 0.008 | Seconds_per_step --> 4.163 |
|
230 |
+
[2024-08-09 21:54:22,462][Main][INFO] - [train] Step 10250 out of 80000 | Loss --> 2.655 | Grad_l2 --> 0.613 | Weights_l2 --> 8688.883 | Lr --> 0.008 | Seconds_per_step --> 4.148 |
|
231 |
+
[2024-08-09 21:57:52,835][Main][INFO] - [train] Step 10300 out of 80000 | Loss --> 2.637 | Grad_l2 --> 0.623 | Weights_l2 --> 8689.917 | Lr --> 0.008 | Seconds_per_step --> 4.207 |
|
232 |
+
[2024-08-09 22:01:30,833][Main][INFO] - [train] Step 10350 out of 80000 | Loss --> 2.650 | Grad_l2 --> 0.636 | Weights_l2 --> 8690.965 | Lr --> 0.008 | Seconds_per_step --> 4.360 |
|
233 |
+
[2024-08-09 22:04:59,449][Main][INFO] - [train] Step 10400 out of 80000 | Loss --> 2.630 | Grad_l2 --> 0.619 | Weights_l2 --> 8691.976 | Lr --> 0.008 | Seconds_per_step --> 4.172 |
|
234 |
+
[2024-08-09 22:08:29,303][Main][INFO] - [train] Step 10450 out of 80000 | Loss --> 2.617 | Grad_l2 --> 0.615 | Weights_l2 --> 8693.000 | Lr --> 0.008 | Seconds_per_step --> 4.197 |
|
235 |
+
[2024-08-09 22:12:03,306][Main][INFO] - [train] Step 10500 out of 80000 | Loss --> 2.627 | Grad_l2 --> 0.615 | Weights_l2 --> 8694.037 | Lr --> 0.008 | Seconds_per_step --> 4.280 |
|
236 |
+
[2024-08-09 22:15:37,789][Main][INFO] - [train] Step 10550 out of 80000 | Loss --> 2.612 | Grad_l2 --> 0.594 | Weights_l2 --> 8695.071 | Lr --> 0.008 | Seconds_per_step --> 4.290 |
|
237 |
+
[2024-08-09 22:19:13,830][Main][INFO] - [train] Step 10600 out of 80000 | Loss --> 2.599 | Grad_l2 --> 0.608 | Weights_l2 --> 8696.095 | Lr --> 0.008 | Seconds_per_step --> 4.321 |
|
238 |
+
[2024-08-09 22:22:47,537][Main][INFO] - [train] Step 10650 out of 80000 | Loss --> 2.598 | Grad_l2 --> 0.619 | Weights_l2 --> 8697.144 | Lr --> 0.008 | Seconds_per_step --> 4.274 |
|
239 |
+
[2024-08-09 22:26:27,089][Main][INFO] - [train] Step 10700 out of 80000 | Loss --> 2.602 | Grad_l2 --> 0.627 | Weights_l2 --> 8698.176 | Lr --> 0.008 | Seconds_per_step --> 4.391 |
|
240 |
+
[2024-08-09 22:30:08,291][Main][INFO] - [train] Step 10750 out of 80000 | Loss --> 2.598 | Grad_l2 --> 0.603 | Weights_l2 --> 8699.195 | Lr --> 0.008 | Seconds_per_step --> 4.424 |
|
241 |
+
[2024-08-09 22:33:50,515][Main][INFO] - [train] Step 10800 out of 80000 | Loss --> 2.600 | Grad_l2 --> 0.615 | Weights_l2 --> 8700.255 | Lr --> 0.008 | Seconds_per_step --> 4.444 |
|
242 |
+
[2024-08-09 22:37:23,733][Main][INFO] - [train] Step 10850 out of 80000 | Loss --> 2.588 | Grad_l2 --> 0.604 | Weights_l2 --> 8701.311 | Lr --> 0.008 | Seconds_per_step --> 4.264 |
|
243 |
+
[2024-08-09 22:40:59,607][Main][INFO] - [train] Step 10900 out of 80000 | Loss --> 2.585 | Grad_l2 --> 0.605 | Weights_l2 --> 8702.327 | Lr --> 0.008 | Seconds_per_step --> 4.317 |
|
244 |
+
[2024-08-09 22:44:33,158][Main][INFO] - [train] Step 10950 out of 80000 | Loss --> 2.581 | Grad_l2 --> 0.595 | Weights_l2 --> 8703.360 | Lr --> 0.008 | Seconds_per_step --> 4.271 |
|
245 |
+
[2024-08-09 22:48:11,589][Main][INFO] - [train] Step 11000 out of 80000 | Loss --> 2.580 | Grad_l2 --> 0.601 | Weights_l2 --> 8704.410 | Lr --> 0.008 | Seconds_per_step --> 4.369 |
|
246 |
+
[2024-08-09 22:51:45,840][Main][INFO] - [train] Step 11050 out of 80000 | Loss --> 2.578 | Grad_l2 --> 0.587 | Weights_l2 --> 8705.448 | Lr --> 0.008 | Seconds_per_step --> 4.285 |
|