diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..d1ebb22262a0790ab9c19bc36dd6c1a7d8a6d58d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20250505_191035-lg5j0rns/run-lg5j0rns.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/dataset_wan.toml b/dataset_wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..212081a4e4f4f6095525508dc34b20a9b24a7f8e --- /dev/null +++ b/dataset_wan.toml @@ -0,0 +1,9 @@ +resolutions = [ 480,] +enable_ar_bucket = false +min_ar = 1.68 +max_ar = 1.88 +num_ar_buckets = 1 +frame_buckets = [21, 29,] +[[directory]] +path = "/workspace/push-in" +num_repeats = 1 diff --git a/epoch10/adapter_config.json b/epoch10/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch10/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch10/adapter_model.safetensors b/epoch10/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59173c29c632d6f6bc013b418ce711e218423780 --- /dev/null +++ b/epoch10/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:289864341f994a6fcf8f5c3414a6dcd23de631a5fe5afad7f264df00541d8a87 +size 359257680 diff --git a/epoch10/wan.toml b/epoch10/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch10/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch20/adapter_config.json b/epoch20/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch20/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch20/adapter_model.safetensors b/epoch20/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a39816fe8278bb5633e41c6e1e4225d8b1b0258 --- /dev/null +++ b/epoch20/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac074113b4b467c929fe5875f6d432eec7de0b0981fbf111b3b4539ceeb81fca +size 359257680 diff --git a/epoch20/wan.toml b/epoch20/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch20/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch30/adapter_config.json b/epoch30/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch30/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch30/adapter_model.safetensors b/epoch30/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c17a6444a7a6ef137b281d1045b3705b74d06eaa --- /dev/null +++ b/epoch30/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aa1c81823f31172c95da55350391df8991fbf7149a561445ca1ded87bca7c92 +size 359257680 diff --git a/epoch30/wan.toml b/epoch30/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch30/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch40/adapter_config.json b/epoch40/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch40/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch40/adapter_model.safetensors b/epoch40/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9bfe2f1f2f3deabc7e27ee837b1bdea18628976e --- /dev/null +++ b/epoch40/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0f364b8c7a21f9f80bc3a73556cac784dc29a37904d7eb9d139409d93697520 +size 359257680 diff --git a/epoch40/wan.toml b/epoch40/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch40/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch50/adapter_config.json b/epoch50/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch50/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch50/adapter_model.safetensors b/epoch50/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..807a359beab9246b278b388efc170de3f67375f9 --- /dev/null +++ b/epoch50/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b26b9b57c892aaf4f39f460930fbdabe4fe065b9b75529d3f8dff22bfec3c62a +size 359257680 diff --git a/epoch50/wan.toml b/epoch50/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch50/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch60/adapter_config.json b/epoch60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch60/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch60/adapter_model.safetensors b/epoch60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb5c11ed5b175001c090942c82afb099d7fd6ca6 --- /dev/null +++ b/epoch60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:018b6bef9b29073808580488549a5a04ab05d9cd7878ec446544d850bcdcade8 +size 359257680 diff --git a/epoch60/wan.toml b/epoch60/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch60/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch70/adapter_config.json b/epoch70/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch70/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch70/adapter_model.safetensors b/epoch70/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9c15ecf32dd02651030a97a2e937f7dba6191349 --- /dev/null +++ b/epoch70/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c9fb051fc7e6fad7591faa818e93db55e6d31657425ffa6b1223456295f9cc1 +size 359257680 diff --git a/epoch70/wan.toml b/epoch70/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch70/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch80/adapter_config.json b/epoch80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch80/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch80/adapter_model.safetensors b/epoch80/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..60bbaa05089b838b11c9d061b9f838555977fbee --- /dev/null +++ b/epoch80/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55088f6a96ac17cf17b9b1a290f3fd31f1195e378deb49b8a4ebab5af40833ce +size 359257680 diff --git a/epoch80/wan.toml b/epoch80/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch80/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/epoch90/adapter_config.json b/epoch90/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcff0f5b204756426e22988f54efb0604e06528 --- /dev/null +++ b/epoch90/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o", + "k", + "v", + "q", + "ffn.0", + "v_img", + "ffn.2", + "k_img" + ], + "task_type": null, + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/epoch90/adapter_model.safetensors b/epoch90/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a0cb03f5b1c86db5ecb050486142a5371e1bec3a --- /dev/null +++ b/epoch90/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c25e965a959758884efd1e90a46c8b4cede98ea1a1ebc31c59fdedc2cb9ee3 +size 359257680 diff --git a/epoch90/wan.toml b/epoch90/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/epoch90/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/events.out.tfevents.1746472272.420c94ca0326.20093.0 b/events.out.tfevents.1746472272.420c94ca0326.20093.0 new file mode 100644 index 0000000000000000000000000000000000000000..4d08ab1a6dab7c5efb9d8b2840003973afc57216 --- /dev/null +++ b/events.out.tfevents.1746472272.420c94ca0326.20093.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc78cf675810341f01a41a7c7ccd6a754c25400995bb4d0046bc0150d9ce6910 +size 196583 diff --git a/global_step1347/layer_00-model_states.pt b/global_step1347/layer_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42aca0cbb86108d21d40999b36c31f9b86048523 --- /dev/null +++ b/global_step1347/layer_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5 +size 920 diff --git a/global_step1347/layer_01-model_states.pt b/global_step1347/layer_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9207d40230294d57d6a9f11af91cf8c82b5bfaa --- /dev/null +++ b/global_step1347/layer_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7100d3e26fa520af5212aadefeb24aaad671b0eaf4ac35796621e87b495b4ee4 +size 8986838 diff --git a/global_step1347/layer_02-model_states.pt b/global_step1347/layer_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5010eaeaac6bbb243c4bfaddd4c5d6d8ee46803c --- /dev/null +++ b/global_step1347/layer_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53604eef3ce9c3701d7c49e833a6471bb8bcc51867208d2ec394fff92482b0d1 +size 8986838 diff --git a/global_step1347/layer_03-model_states.pt b/global_step1347/layer_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b0a072a8d745601c94b09308885a8599e19f47c --- /dev/null +++ b/global_step1347/layer_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03c84057c3ff7784ea7b61dc1e9537396a690fd73974e58774c007e714260cb4 +size 8986838 diff --git a/global_step1347/layer_04-model_states.pt b/global_step1347/layer_04-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a12f6e5dfd43fba9e16cd82c66e6defb3cb162c --- /dev/null +++ b/global_step1347/layer_04-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8357f02b722621308523d92acd7506dccd3f17d3ebd8e7f3d43c1e34e9c8c848 +size 8986838 diff --git a/global_step1347/layer_05-model_states.pt b/global_step1347/layer_05-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..158f600f253df26d288de2f8a13b0115ca6dc312 --- /dev/null +++ b/global_step1347/layer_05-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c7d27d5a18b0316dbbddb16d1d9bbeac45b2c66ea553b8d152a7df9d5fad1c +size 8986838 diff --git a/global_step1347/layer_06-model_states.pt b/global_step1347/layer_06-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0efd7de1b7ae5e7037b417e9f4833791c5ba3eb9 --- /dev/null +++ b/global_step1347/layer_06-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4624cab288a75205f419b56e51adc265de9dccfa14c67b8571ff3092a9ce1d21 +size 8986838 diff --git a/global_step1347/layer_07-model_states.pt b/global_step1347/layer_07-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80eb2df2352c0570ff92f57d90ce5ba0f28690c3 --- /dev/null +++ b/global_step1347/layer_07-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b23272ad418c071cf4b938a8c32fe87a8b299b902dcc42a3ffd5abd7b34144a +size 8986838 diff --git a/global_step1347/layer_08-model_states.pt b/global_step1347/layer_08-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b541fffae87fbc3c1256214065842580d225cb5 --- /dev/null +++ b/global_step1347/layer_08-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6386616c4a89bb31ecee50c953b444e1b6ef593b8ef31c01b04d1f8a63156875 +size 8986838 diff --git a/global_step1347/layer_09-model_states.pt b/global_step1347/layer_09-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0aadf0947316bcf1522c284d4ea39b6e2705be8 --- /dev/null +++ b/global_step1347/layer_09-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589e276b89d62900e4d88c5fc3fa96c34da720712431617f86b03c87ffc0635b +size 8986838 diff --git a/global_step1347/layer_10-model_states.pt b/global_step1347/layer_10-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8de173c37c51afe2a6134192e73a503a66ecfd16 --- /dev/null +++ b/global_step1347/layer_10-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80395f67c0189df41d694fe127ac2024e4605b3eb5156c98863600e8d0af0530 +size 8986838 diff --git a/global_step1347/layer_11-model_states.pt b/global_step1347/layer_11-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe051654734c4cb8f1fe072c3d23ad657c21cc7d --- /dev/null +++ b/global_step1347/layer_11-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7735fb65d7188cc33e6628493c940ec8b71901f45942892f197a05ac284d93d9 +size 8986838 diff --git a/global_step1347/layer_12-model_states.pt b/global_step1347/layer_12-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b82f00e26cd1eca5a7fe8041fc0fa364515dd993 --- /dev/null +++ b/global_step1347/layer_12-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51c141f4c5ed2a2c8005f581457364760c1f1916a8bde29b52324f2b529a76f0 +size 8986838 diff --git a/global_step1347/layer_13-model_states.pt b/global_step1347/layer_13-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a85b7094a4cf959f54a8e804785c690b0ffb4a74 --- /dev/null +++ b/global_step1347/layer_13-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54a712e41d40d643b548b7df9040dbfaed0f0f0d5d07bdad0f7670bfca159264 +size 8986838 diff --git a/global_step1347/layer_14-model_states.pt b/global_step1347/layer_14-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a005e8ec0cacf6f98ff467a17093da7b2677a71 --- /dev/null +++ b/global_step1347/layer_14-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06997565bb8ba811ab0f82a6b185df638e6204fd04f5f5908436789dbd72ba7 +size 8986838 diff --git a/global_step1347/layer_15-model_states.pt b/global_step1347/layer_15-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84a8c8f839d22619a97a48c6ec64a2cfea1857ed --- /dev/null +++ b/global_step1347/layer_15-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14e50409e4577a754706ef51dddb84291fc28916e332a4097b8dad56ec2a7de +size 8986838 diff --git a/global_step1347/layer_16-model_states.pt b/global_step1347/layer_16-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b213410548d775c4c591e73ad534c2393d3f830c --- /dev/null +++ b/global_step1347/layer_16-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9efc6563cc1bc1277d6322bfa2986291678391a61d75c9e208caac873b5a602e +size 8986838 diff --git a/global_step1347/layer_17-model_states.pt b/global_step1347/layer_17-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..276d845b05fd0f6a1f323d9c21d87581d125d2db --- /dev/null +++ b/global_step1347/layer_17-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ea56cd7102eedb76d670cfe01055c65b6ee2e10486f48a73911d4a81bca5a0 +size 8986838 diff --git a/global_step1347/layer_18-model_states.pt b/global_step1347/layer_18-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5991fc99396f991fac9e7431837b7c4b381357cd --- /dev/null +++ b/global_step1347/layer_18-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41fd7023adf82f25c5c91a9cbe32ffe763814550cca1269bf2b31f6f8141ae14 +size 8986838 diff --git a/global_step1347/layer_19-model_states.pt b/global_step1347/layer_19-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a0794c83358c3d31923a2992df1012793b71848 --- /dev/null +++ b/global_step1347/layer_19-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2baedfd12f75feb166421fa9e248835e86c2f4751589513316cd5afb1b37dd +size 8986838 diff --git a/global_step1347/layer_20-model_states.pt b/global_step1347/layer_20-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa8d9541438bbdcfe7b412a5afb3be3a6ab0f84b --- /dev/null +++ b/global_step1347/layer_20-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21e44bcc64c3ff0b4f310f39f71f7f1a9d95d8730088410cf9e6f1b7d265a15e +size 8986838 diff --git a/global_step1347/layer_21-model_states.pt b/global_step1347/layer_21-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33e7e02d940d26e3819a5af54a9c52118bf610f5 --- /dev/null +++ b/global_step1347/layer_21-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9137773d88bca8ba82ac5b4abf8b65a0d2fed57597517c985f1fd0465975a552 +size 8986838 diff --git a/global_step1347/layer_22-model_states.pt b/global_step1347/layer_22-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..750a114f4a758618d8edb269e3411a7b1d6c9763 --- /dev/null +++ b/global_step1347/layer_22-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93bb16670b4ca74bf0aa1f6625e5e11e65df5bb0d1779550cc0b13e75b3df8d7 +size 8986838 diff --git a/global_step1347/layer_23-model_states.pt b/global_step1347/layer_23-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbdb570bff7d30fb409f85eded4813e02cc362e7 --- /dev/null +++ b/global_step1347/layer_23-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537e8030f4d0d1241629a3f554db9bf5380328135aee1297cec95c27ae3d24e5 +size 8986838 diff --git a/global_step1347/layer_24-model_states.pt b/global_step1347/layer_24-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06b2217ec5ef094ce935ea0ccbdcb00c999af183 --- /dev/null +++ b/global_step1347/layer_24-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cbacd6ba1ad373c91477b59d707fe86f1efd8f357e5a7183374d2f4872b7439 +size 8986838 diff --git a/global_step1347/layer_25-model_states.pt b/global_step1347/layer_25-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3be40e3f26b72b3f47fe2cb4a85fa8da47ecb9d3 --- /dev/null +++ b/global_step1347/layer_25-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caea98f23c0e099d25730b4365f88f81e9cd52bc9e45dc27302e1705f1dc4123 +size 8986838 diff --git a/global_step1347/layer_26-model_states.pt b/global_step1347/layer_26-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28404529766a0c9997df8169cb90fe19f1883e62 --- /dev/null +++ b/global_step1347/layer_26-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce689f3bbd6e0198f0534bb91d7482df1f4200c2132b232907cece0c043115f3 +size 8986838 diff --git a/global_step1347/layer_27-model_states.pt b/global_step1347/layer_27-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04e5e73f6413547f4f02025ed98352c9f248df08 --- /dev/null +++ b/global_step1347/layer_27-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d02556966860e61fbb74243e50bd695ff1e75e7a8978a9a950bb36e7a787e30b +size 8986838 diff --git a/global_step1347/layer_28-model_states.pt b/global_step1347/layer_28-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50f107c9fb8474a0b2b355de12abaf29bafbfaf3 --- /dev/null +++ b/global_step1347/layer_28-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc1ea8d2fed5ad8eb5eb6245fc99a243efe764ee8c9de772ea795303fb2966db +size 8986838 diff --git a/global_step1347/layer_29-model_states.pt b/global_step1347/layer_29-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01fb15a508a903f3f86ef12dd3ddc073df80b411 --- /dev/null +++ b/global_step1347/layer_29-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a04fa822c9007fbde03ffcdda52694ed49bf3225cac1838ac4fdee819939afd4 +size 8986838 diff --git a/global_step1347/layer_30-model_states.pt b/global_step1347/layer_30-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4de2b2511c2bd17c063714a30d908183f7e204e7 --- /dev/null +++ b/global_step1347/layer_30-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9cbfde9211b5fe40b886ecd3d230706fb071a33b9a13ccce6aae913df49b8fc +size 8986838 diff --git a/global_step1347/layer_31-model_states.pt b/global_step1347/layer_31-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6b8d7a60df456bc8799f851f1e70db3c04d4bb8 --- /dev/null +++ b/global_step1347/layer_31-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2fa0f0a5322911ba4f25afb71d488b6941a6bbc9c88147c8c938e5f458e523d +size 8986838 diff --git a/global_step1347/layer_32-model_states.pt b/global_step1347/layer_32-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5aaacb39517639566739771ab1984e32b9fa0650 --- /dev/null +++ b/global_step1347/layer_32-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97b183f6d988eb4ab38f03b2397cd53691813c4a4bcdd8265a4d928db2fa5ea6 +size 8986838 diff --git a/global_step1347/layer_33-model_states.pt b/global_step1347/layer_33-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d0cfa72e904d9f52f07a1ebbe91ace8f9741e57 --- /dev/null +++ b/global_step1347/layer_33-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b0b28b4f75aa5e7550d8c93ce3d29291d9a224a291c05e8c401cba0c34c96a4 +size 8986838 diff --git a/global_step1347/layer_34-model_states.pt b/global_step1347/layer_34-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af81729221e6de7984e6b802494902a250adacc2 --- /dev/null +++ b/global_step1347/layer_34-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5f96a08ed422af7c4203fb3f241d845682aba198a8a40ff52ef080881085ef +size 8986838 diff --git a/global_step1347/layer_35-model_states.pt b/global_step1347/layer_35-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ae9da65e254ef82e5d782c8949d8a5e5d7edbd6 --- /dev/null +++ b/global_step1347/layer_35-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f42a603956b0df571af074f429508aa924ef863aa1c0081424fb9d4a39f7d436 +size 8986838 diff --git a/global_step1347/layer_36-model_states.pt b/global_step1347/layer_36-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f66345ed34e5a7833237616d20e638755de78ab3 --- /dev/null +++ b/global_step1347/layer_36-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:178d904e23c8cc5d4c8eb84a811d1969d13b813ac6b6199ac1df3fdc0c7cbd8e +size 8986838 diff --git a/global_step1347/layer_37-model_states.pt b/global_step1347/layer_37-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1da40ee5160953b556ad1602044264f4b5c0ee8 --- /dev/null +++ b/global_step1347/layer_37-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a83195158ca497479c87c6555fb7cce3d2c0eb1ebaabac9f4bfeba4a4ee874c0 +size 8986838 diff --git a/global_step1347/layer_38-model_states.pt b/global_step1347/layer_38-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e654a24b7b372080adfe5aff4fe18c60fcabccb3 --- /dev/null +++ b/global_step1347/layer_38-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8daead6134ba6c98b2607fea9ba946f13e110e5d8b589288e4a8ca47e52549 +size 8986838 diff --git a/global_step1347/layer_39-model_states.pt b/global_step1347/layer_39-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3314bbcb7ee0d304ffdb63f96881382b2b21cf3b --- /dev/null +++ b/global_step1347/layer_39-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e549a0f2e3629e88e86ccd6afa810d2ca41fb01e279af1e3c3d9c2ea2ccba1ba +size 8986838 diff --git a/global_step1347/layer_40-model_states.pt b/global_step1347/layer_40-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd23b9ae993bf9afeff214fe5caa47adf9a11887 --- /dev/null +++ b/global_step1347/layer_40-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1801bf56e1179a6c8120f81691c1af8d1cc160d8ed30e2336292cfdd56e100a3 +size 8986838 diff --git a/global_step1347/layer_41-model_states.pt b/global_step1347/layer_41-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd39c77c38740b902e047900c7b8d0060c5dab2 --- /dev/null +++ b/global_step1347/layer_41-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7cd44b9d5e66a50fda4fcfc9b19b410cfcb8f8bb3e5334cbb02f47228905913 +size 920 diff --git a/global_step1347/mp_rank_00_model_states.pt b/global_step1347/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d5ab06321007b1da2c836efcea21079aae1fa6b --- /dev/null +++ b/global_step1347/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32a19b81c696ba250ade816a441eebf8fd0ee2dbca83b432e3c7744dfbb80843 +size 1078301785 diff --git a/global_step2020/layer_00-model_states.pt b/global_step2020/layer_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42aca0cbb86108d21d40999b36c31f9b86048523 --- /dev/null +++ b/global_step2020/layer_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5 +size 920 diff --git a/global_step2020/layer_01-model_states.pt b/global_step2020/layer_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e68c10ce53004963b06ef0890f76c62b71679de --- /dev/null +++ b/global_step2020/layer_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a87da52656f090dae3e975762358d2f50f01094f637c2d90d977547851bfd89a +size 8986838 diff --git a/global_step2020/layer_02-model_states.pt b/global_step2020/layer_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67f725db69eb8f700b37feb4851dcad73807f28e --- /dev/null +++ b/global_step2020/layer_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90638d8dd74f2e6cb7ff5c0a8d0d37e70f6d1662569c66bb9a328197ab8e26a3 +size 8986838 diff --git a/global_step2020/layer_03-model_states.pt b/global_step2020/layer_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9fa4f5cf7dab8e9384b5e5ef293f522d167c2b3 --- /dev/null +++ b/global_step2020/layer_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a5abbf7ad55d3adfc3f052c37e2d1617535365b6aa804824e70d63914972a4 +size 8986838 diff --git a/global_step2020/layer_04-model_states.pt b/global_step2020/layer_04-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49ed26de42cd4cc719c6a45775687525139ec3eb --- /dev/null +++ b/global_step2020/layer_04-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff01264ecdc641ff33bf31baab8ac8387f4e24a550681bc04bd4de6e0caf5587 +size 8986838 diff --git a/global_step2020/layer_05-model_states.pt b/global_step2020/layer_05-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61bbc6f4340b5fe01bd62c853e796da36eb736e8 --- /dev/null +++ b/global_step2020/layer_05-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e8586eed590215d6e2aae89ead184cd22b9364f1ea6948fb0c2392ecfb882c +size 8986838 diff --git a/global_step2020/layer_06-model_states.pt b/global_step2020/layer_06-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f844e751f1f94072cece1dae327b0d76bbab05ca --- /dev/null +++ b/global_step2020/layer_06-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aef4e74aac092fcd33656a39eeebb78de6a48a1c4a361354fab9f9b13547186 +size 8986838 diff --git a/global_step2020/layer_07-model_states.pt b/global_step2020/layer_07-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..250386fc3428a54ceb2d3180a09848a321ef33d3 --- /dev/null +++ b/global_step2020/layer_07-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7386cd7e0f8859fd70da9fa9a16337aa4fa26e8bf82d6486ee42cbdb313a44a8 +size 8986838 diff --git a/global_step2020/layer_08-model_states.pt b/global_step2020/layer_08-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c65f1d7a9422c1c4ef548fa5d2dd2c63c45857e8 --- /dev/null +++ b/global_step2020/layer_08-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87ef016d29b7652a6dbd431358406bd68fd87c892de55fb4058e23e63425a812 +size 8986838 diff --git a/global_step2020/layer_09-model_states.pt b/global_step2020/layer_09-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a382219599c96fc83ad5c762562f1271449777d7 --- /dev/null +++ b/global_step2020/layer_09-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e14a8adba77384d25d1ae20198018ba21dd06e118b1ba3ae7964e0aed6e5a69e +size 8986838 diff --git a/global_step2020/layer_10-model_states.pt b/global_step2020/layer_10-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc462a3d191c8a239b3a74d9c6bbe16d161e50ac --- /dev/null +++ b/global_step2020/layer_10-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c244c08430887c56a76b59fb591afdf65155b178f8255758ecdf936a8d5d3beb +size 8986838 diff --git a/global_step2020/layer_11-model_states.pt b/global_step2020/layer_11-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13da24f3ccf02be49816fd929317f2ca1e8f1085 --- /dev/null +++ b/global_step2020/layer_11-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55eed50752525b070ec4017e8f2678907b9ad0f7c3f38adc918b65d261953982 +size 8986838 diff --git a/global_step2020/layer_12-model_states.pt b/global_step2020/layer_12-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5d5e8b78c1c228332712bf1e4c625e44bf590b2 --- /dev/null +++ b/global_step2020/layer_12-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7273a7d6b9b4a9751d8d0ad33c1b2acbd913a882cc829a62f9eafd060037f4d +size 8986838 diff --git a/global_step2020/layer_13-model_states.pt b/global_step2020/layer_13-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c42ffb8e1a91d3df3925228dc355217aad5c7ef --- /dev/null +++ b/global_step2020/layer_13-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eaf663ffc367c102b525ccad98fa8b79f941e1b1e000667c0d7c9868f2f1e8d +size 8986838 diff --git a/global_step2020/layer_14-model_states.pt b/global_step2020/layer_14-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..380bc7af35c31a2400e7da5efe48db30a6f3818c --- /dev/null +++ b/global_step2020/layer_14-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1015621af73ba96f4a877a71a70cb7ee27401c244a6c47f8f59c3bd744d015 +size 8986838 diff --git a/global_step2020/layer_15-model_states.pt b/global_step2020/layer_15-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d64ea77d86f49ef48737755fc64843b104b00ab --- /dev/null +++ b/global_step2020/layer_15-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d7c860195c2af0371f41d24b5ac7a95fca95feed4f891d996ab93d972a6173 +size 8986838 diff --git a/global_step2020/layer_16-model_states.pt b/global_step2020/layer_16-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..955a4df4bbca3541532222cc45394c628fb31d34 --- /dev/null +++ b/global_step2020/layer_16-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bffc692072f656c5aefe8793afe4804ad661453245298abbf420b536b413bd76 +size 8986838 diff --git a/global_step2020/layer_17-model_states.pt b/global_step2020/layer_17-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f041d5737d047aafe5bb4e72b7f6c419cd5a404d --- /dev/null +++ b/global_step2020/layer_17-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e146ad2e0fa00054f6900bf2ca62f5f80883ff7615365ea7844aec4d7405442 +size 8986838 diff --git a/global_step2020/layer_18-model_states.pt b/global_step2020/layer_18-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c173777afd9d99a820aaaf1d39d82c7df2237003 --- /dev/null +++ b/global_step2020/layer_18-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c3d69cbf38ccc97175a3610a4e442f1ae91363d620ca2d3d158219ca5708f85 +size 8986838 diff --git a/global_step2020/layer_19-model_states.pt b/global_step2020/layer_19-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9f0dd091c692ab7ce76da01bad6a41a2d1680c5 --- /dev/null +++ b/global_step2020/layer_19-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31632ae93a3ee88a439565309f498f93318f2d31704fe5de6d716d696505d906 +size 8986838 diff --git a/global_step2020/layer_20-model_states.pt b/global_step2020/layer_20-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bebfebb6e309bf62aa393ba5f7d318daf9245cdb --- /dev/null +++ b/global_step2020/layer_20-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef356c6d1bd3dd2e01d8d29f9c80f214766904b4d5ce13f8874f61db0b85965 +size 8986838 diff --git a/global_step2020/layer_21-model_states.pt b/global_step2020/layer_21-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e009730bb2c3e97097b7866c369ed82a4e2522d0 --- /dev/null +++ b/global_step2020/layer_21-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcfff754cbd44029108a256c2b10d31967ca6580836e158dbcde88fcc87a26b3 +size 8986838 diff --git a/global_step2020/layer_22-model_states.pt b/global_step2020/layer_22-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d72e85d28689d8495300e8afb83be13ae66a559 --- /dev/null +++ b/global_step2020/layer_22-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f38dfc303480efff5823d0de070d61a9a7fc851dccc15b286beee66f7d863fa5 +size 8986838 diff --git a/global_step2020/layer_23-model_states.pt b/global_step2020/layer_23-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c9550082afe397761a8733d0a33674a05b88749 --- /dev/null +++ b/global_step2020/layer_23-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c3093f908cc07614fd346b216ffd270028da58e34925bc1e4973181bc90b10 +size 8986838 diff --git a/global_step2020/layer_24-model_states.pt b/global_step2020/layer_24-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6971bbc291746c2e8167a5809315242aa6c06cc --- /dev/null +++ b/global_step2020/layer_24-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2151d30db5d84e05259a2bc003ef95e2c4e18973883da414b6bb3f28327fabc8 +size 8986838 diff --git a/global_step2020/layer_25-model_states.pt b/global_step2020/layer_25-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c500629acb41ce0fa45de1c1798e05eb9699d664 --- /dev/null +++ b/global_step2020/layer_25-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8612ab0ea24d6259bb5fba1dff99e389f642ba63f0227de1a97eb3ea821fa35a +size 8986838 diff --git a/global_step2020/layer_26-model_states.pt b/global_step2020/layer_26-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9277256d480c73ab536a37131aaf38521568e63f --- /dev/null +++ b/global_step2020/layer_26-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec3b135d84e9994bebb8a4fa1f6e7b1b6a6d7c2e518a5b8a41bef77a0a29df90 +size 8986838 diff --git a/global_step2020/layer_27-model_states.pt b/global_step2020/layer_27-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3540a1e2b120244191b43bc662a06c0ccb7d3fed --- /dev/null +++ b/global_step2020/layer_27-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f69ecec672124216750a82cff25a1a0973cc0389d12e996430004df700315cc +size 8986838 diff --git a/global_step2020/layer_28-model_states.pt b/global_step2020/layer_28-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b77500bb879a4e271a09173f3c0df3ba1a166eab --- /dev/null +++ b/global_step2020/layer_28-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:841cea8ec39f92a31eb3a0e403d8193088da4d93d8cf7538aaae316156c59935 +size 8986838 diff --git a/global_step2020/layer_29-model_states.pt b/global_step2020/layer_29-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9aba44752edcf01d5a6b2bd35e0ad2f0ce51a12b --- /dev/null +++ b/global_step2020/layer_29-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a829c0e160bbefc5aa5282a9841b141f40b206840a6754eaf71e8d27eb38353 +size 8986838 diff --git a/global_step2020/layer_30-model_states.pt b/global_step2020/layer_30-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..924a50ca80d1dfd88ba67439f9b932ffb0fd65ab --- /dev/null +++ b/global_step2020/layer_30-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24a4f679265a1d4661b92bed59e62c0f18807195f4194c9ff65707022d730d9d +size 8986838 diff --git a/global_step2020/layer_31-model_states.pt b/global_step2020/layer_31-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdf7740325fa4d826c27b993445eb6e6890d6ae3 --- /dev/null +++ b/global_step2020/layer_31-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98d31a1a84458a443a1531d17eba6d431cb47c2c90fafd3be16b46a3f501ef7c +size 8986838 diff --git a/global_step2020/layer_32-model_states.pt b/global_step2020/layer_32-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df2ca0989bad806257d41d3d061fac61cf2fafd6 --- /dev/null +++ b/global_step2020/layer_32-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b5a6412e06ac8ac048cf151b6ed6dc1b6c2b9550338235ea655aeb0ad06dc54 +size 8986838 diff --git a/global_step2020/layer_33-model_states.pt b/global_step2020/layer_33-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5184343583a8525d8d06a420b79a1a411070e58e --- /dev/null +++ b/global_step2020/layer_33-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:467ae699660ca14569639b5cc348e21594fa4b5395d3df985e6d4ed5911e288d +size 8986838 diff --git a/global_step2020/layer_34-model_states.pt b/global_step2020/layer_34-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2087d604bdca0af2b4c8d3023634db6787577730 --- /dev/null +++ b/global_step2020/layer_34-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74afbe05707df4af9ec7534bbae583ac30710cb8ec7a7d14784bb058a4f7bcf5 +size 8986838 diff --git a/global_step2020/layer_35-model_states.pt b/global_step2020/layer_35-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8681a4ea319afebdf16ed8f78266e3f50161608 --- /dev/null +++ b/global_step2020/layer_35-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6515aed8a01f8cae6906d91070f328ca0c1020b4c79afe51eed7effa0390f8b +size 8986838 diff --git a/global_step2020/layer_36-model_states.pt b/global_step2020/layer_36-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1eeab0513b5ab338f9fb22ef3a4f82702fc56b55 --- /dev/null +++ b/global_step2020/layer_36-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:446b2a947861f66462c26a357a7e52106a0b67eb75b55604623e37152a1b8149 +size 8986838 diff --git a/global_step2020/layer_37-model_states.pt b/global_step2020/layer_37-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8bc6dd7a650192ea8e57cfedb599519dddc183b --- /dev/null +++ b/global_step2020/layer_37-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce18b1a04316a0e4a9184a1819194e9b7d5904107dd0bc326bd4b363481e317 +size 8986838 diff --git a/global_step2020/layer_38-model_states.pt b/global_step2020/layer_38-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94cd595fc47406ccf189319d752cd9ee3f05a664 --- /dev/null +++ b/global_step2020/layer_38-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fa7378a7e5a42815a7288f2800810e1dba56cfbe01c9bc2eff454b02a01adc2 +size 8986838 diff --git a/global_step2020/layer_39-model_states.pt b/global_step2020/layer_39-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cfed5a390a9f264c65f027e565e88a6c3a59077 --- /dev/null +++ b/global_step2020/layer_39-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abc1e78bd2de84453d2ccdd1516c21324bb1d58661aae1f1e065d4c6d997f555 +size 8986838 diff --git a/global_step2020/layer_40-model_states.pt b/global_step2020/layer_40-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e2cd3d0403638827c7c4a6d5cd97a14b4bb2d76 --- /dev/null +++ b/global_step2020/layer_40-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49f54dca6d726d7e8c6c9930369918c7e4a42634e84a55a7787aa2f4be80f7f6 +size 8986838 diff --git a/global_step2020/layer_41-model_states.pt b/global_step2020/layer_41-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd39c77c38740b902e047900c7b8d0060c5dab2 --- /dev/null +++ b/global_step2020/layer_41-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7cd44b9d5e66a50fda4fcfc9b19b410cfcb8f8bb3e5334cbb02f47228905913 +size 920 diff --git a/global_step2020/mp_rank_00_model_states.pt b/global_step2020/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb1c1235b6f1b56a50768314fc91c5ea23ab3b67 --- /dev/null +++ b/global_step2020/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91cbf863cf5328ed6216714e774dae4cc5c429c53b632b176d105491046442c7 +size 1078301785 diff --git a/global_step2693/layer_00-model_states.pt b/global_step2693/layer_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42aca0cbb86108d21d40999b36c31f9b86048523 --- /dev/null +++ b/global_step2693/layer_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5 +size 920 diff --git a/global_step2693/layer_01-model_states.pt b/global_step2693/layer_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5e49436f9107c75c0fd467bbaef9579bcfad45f --- /dev/null +++ b/global_step2693/layer_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe8d4e779d198a73be2d8b36213e77db6b3e76d3d63a0beab2a287b6f038fe8e +size 8986838 diff --git a/global_step2693/layer_02-model_states.pt b/global_step2693/layer_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c11f742c4754cb9f4f637690cb5306fb9725c3f8 --- /dev/null +++ b/global_step2693/layer_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65dbab89e693314800c641e208b5a419443abfe691a7f9c44d70b97e84ef573f +size 8986838 diff --git a/global_step2693/layer_03-model_states.pt b/global_step2693/layer_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76ac39b43694611de4826cf1dee0f96f2365802e --- /dev/null +++ b/global_step2693/layer_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a99e71cac1386a0921af273ee054f2a1ed05a84c88a9071ec28328479aa5182 +size 8986838 diff --git a/global_step2693/layer_04-model_states.pt b/global_step2693/layer_04-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5944f5f686563aae025b97d7ceeb9c33ace7a756 --- /dev/null +++ b/global_step2693/layer_04-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e95c0bf035cdcb009c81f7107bc645ad4f96f283c411c3ac15f516df6f57f367 +size 8986838 diff --git a/global_step2693/layer_05-model_states.pt b/global_step2693/layer_05-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8c0f9bd48084ab85e3398ad74d2473239b856d4 --- /dev/null +++ b/global_step2693/layer_05-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a33c50b366cfc337d372d7f9ef0d3a18edbfb99cd1970034a0bfd197130ad242 +size 8986838 diff --git a/global_step2693/layer_06-model_states.pt b/global_step2693/layer_06-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c82efdd5eac9d308f0425084c35f3bee5482c4e --- /dev/null +++ b/global_step2693/layer_06-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a015e01e3937af051a805396b84441e15427d47c141efeadfcfd096941c6d4cb +size 8986838 diff --git a/global_step2693/layer_07-model_states.pt b/global_step2693/layer_07-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc6bf3159667c57aa41d05e7b90d4046200aa196 --- /dev/null +++ b/global_step2693/layer_07-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f406d474e90821453b88dcca85292c311225bcb105666a11a3bc028b5ed9bdf +size 8986838 diff --git a/global_step2693/layer_08-model_states.pt b/global_step2693/layer_08-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0837dd310487ead7eb47d3d2596f3f8eadfdfe07 --- /dev/null +++ b/global_step2693/layer_08-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26667551a4f847272f4d47d0e6c7c0b77536d895ef0efa68e5f604d7b5777f26 +size 8986838 diff --git a/global_step2693/layer_09-model_states.pt b/global_step2693/layer_09-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2f6ec413378b3b9fac6467d4234536fbe873413 --- /dev/null +++ b/global_step2693/layer_09-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b783d3781062b62497bb6b06c5449e699519a8ced269120d1e47866476c649a +size 8986838 diff --git a/global_step2693/layer_10-model_states.pt b/global_step2693/layer_10-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8994ce12260224dc051e4f1f242549b16c7fb2e --- /dev/null +++ b/global_step2693/layer_10-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6154699331ca65f598c612e24523a548ec762fd86104c4a7b4ee956706cf8f7 +size 8986838 diff --git a/global_step2693/layer_11-model_states.pt b/global_step2693/layer_11-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd78485c151fb37408c43e05f9bfc1e8e8cf4f2f --- /dev/null +++ b/global_step2693/layer_11-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50674a770161d44040a4b5e6f9a07a6fb2f86d7b9efa8986fd9c2b25c32e002b +size 8986838 diff --git a/global_step2693/layer_12-model_states.pt b/global_step2693/layer_12-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..894805b3c9e4d35485cbd37d0912b7650fa432f6 --- /dev/null +++ b/global_step2693/layer_12-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc7c5806dff483e716f18e39e71a213858dcc30657f9b02d38cf583bf110b06c +size 8986838 diff --git a/global_step2693/layer_13-model_states.pt b/global_step2693/layer_13-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e831bd86d09a09af2d835483babc0ff7dca6a79c --- /dev/null +++ b/global_step2693/layer_13-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da1bed8159c3f435a9ddd0627748a55d7211fc57820ca3a28159f23f149daac +size 8986838 diff --git a/global_step2693/layer_14-model_states.pt b/global_step2693/layer_14-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38932effd26381390857108d8dd2ce88f6394b41 --- /dev/null +++ b/global_step2693/layer_14-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3b02b66ae6fd7fe398c40c35ad55d2805457192fa1668e8cd30f7f27e5b8edf +size 8986838 diff --git a/global_step2693/layer_15-model_states.pt b/global_step2693/layer_15-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbd9e0d16a2348186aae2efc131e30a1c12d1c52 --- /dev/null +++ b/global_step2693/layer_15-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf14928d3a1a903cbea7cd963fc5a0b38ce379992bf1453e13ba26c83f55be9 +size 8986838 diff --git a/global_step2693/layer_16-model_states.pt b/global_step2693/layer_16-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6fa333c8d68bcc5abe9a038d5a8fc1eb0b94239 --- /dev/null +++ b/global_step2693/layer_16-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1fc4b58360bc7d4c807cc305055c91dca911dc919f55925303d2efbb9b819d7 +size 8986838 diff --git a/global_step2693/layer_17-model_states.pt b/global_step2693/layer_17-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d76fd24f1b295f394d5e1835e8cea1cc970600b --- /dev/null +++ b/global_step2693/layer_17-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:117006952bc4014887dfe2abb5493c263bfe5a97da11e7852fd8b8bf0bc91580 +size 8986838 diff --git a/global_step2693/layer_18-model_states.pt b/global_step2693/layer_18-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..563afeaeae0063683ebd6fb2935de8c50ba84f69 --- /dev/null +++ b/global_step2693/layer_18-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c76d55fb38685df1de3cf1a29e3993bd60b6ee2f07c94c95d3088c9e7663c2a +size 8986838 diff --git a/global_step2693/layer_19-model_states.pt b/global_step2693/layer_19-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72f385f2f679816d3a65f2eae5d9aefccc48f430 --- /dev/null +++ b/global_step2693/layer_19-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e8fa67e517822c6ff794b6f90a0b28393c97facb700c90365c346c47a21089 +size 8986838 diff --git a/global_step2693/layer_20-model_states.pt b/global_step2693/layer_20-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fe27bbfc66808cfc6a8e5145485f155030868c1 --- /dev/null +++ b/global_step2693/layer_20-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8168d4e60ea0c78fa0f149314e7c2b4f192bffa9e8c53e553c14a9b25a60292c +size 8986838 diff --git a/global_step2693/layer_21-model_states.pt b/global_step2693/layer_21-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e30d46958d375c0b9e4ef77707ff3c55b2c310ea --- /dev/null +++ b/global_step2693/layer_21-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58871125a9701cbde1661167ca41a97bfe6e64c11dd39670e207dd15e82ac9ff +size 8986838 diff --git a/global_step2693/layer_22-model_states.pt b/global_step2693/layer_22-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46822d9b75ef1043d391f2dd1fc3541eabda155c --- /dev/null +++ b/global_step2693/layer_22-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c8dc463a976e1aef7ce99eac9e4a887add23db0130592aa345469fec836732 +size 8986838 diff --git a/global_step2693/layer_23-model_states.pt b/global_step2693/layer_23-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b6588d31651d5b3ccc1815c96d4f22322f4f54e --- /dev/null +++ b/global_step2693/layer_23-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dffb903f3308a7f69d3080ac311e5b94d60bede503ef7c03e5b11725be1e83b +size 8986838 diff --git a/global_step2693/layer_24-model_states.pt b/global_step2693/layer_24-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51a10646bdcb449510018343c9d028322d8afea5 --- /dev/null +++ b/global_step2693/layer_24-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abddbd6536a18f84030cc22f329031180aa3bdd358ef30dcad466b17a6f43990 +size 8986838 diff --git a/global_step2693/layer_25-model_states.pt b/global_step2693/layer_25-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..012fd7da04f0f0523a599dda156012ffcfaa2c82 --- /dev/null +++ b/global_step2693/layer_25-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f703e504938b0c94f6f887a193770cc5d26e642d3a43b8916e61fbe6893974d +size 8986838 diff --git a/global_step2693/layer_26-model_states.pt b/global_step2693/layer_26-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ea1c311caa36d11fc33f6ae9311de9e28d20746 --- /dev/null +++ b/global_step2693/layer_26-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1910bbdc77e30a01e007ac810f9668342f90ca9c8ed7c668cd8c589fb079043 +size 8986838 diff --git a/global_step2693/layer_27-model_states.pt b/global_step2693/layer_27-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a387e38a016447b515fd8de97d5f919945afc71 --- /dev/null +++ b/global_step2693/layer_27-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed1518df358bbe13ffdbb87d32ffc3ce119e2593028307c93a44858dfcfd47c +size 8986838 diff --git a/global_step2693/layer_28-model_states.pt b/global_step2693/layer_28-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4ae21dad1fd7a016f3e1808c709aceb5ff1bbec --- /dev/null +++ b/global_step2693/layer_28-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1890e45fc2ccf31ba1b01949cbc634b3a7e1d08d6fb916ba67b2d1a7e9dc74dd +size 8986838 diff --git a/global_step2693/layer_29-model_states.pt b/global_step2693/layer_29-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b2ce47dfff5d89afc4533363b3da5c0c58be747 --- /dev/null +++ b/global_step2693/layer_29-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8068a19a06418343f66b9dee7f69e48d3eb980c84d503171f831f69be9f94058 +size 8986838 diff --git a/global_step2693/layer_30-model_states.pt b/global_step2693/layer_30-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b49ce72e76a49bc0d595fb95e669104654ff26ee --- /dev/null +++ b/global_step2693/layer_30-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc808744cb1d6bd4b6348971cc4603c3634b0fa4fdc35bf8b9815045dbc0bd0e +size 8986838 diff --git a/global_step2693/layer_31-model_states.pt b/global_step2693/layer_31-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9d3e408197630c6321e6179c55238ebbd581ce1 --- /dev/null +++ b/global_step2693/layer_31-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c2eeef06e3fa227c89b97c7c81f002765948cde2c6f16b74aa3565cdfc354a3 +size 8986838 diff --git a/global_step2693/layer_32-model_states.pt b/global_step2693/layer_32-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f51f08eaf492d7459a521f91bdbf9f45399e798b --- /dev/null +++ b/global_step2693/layer_32-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:230658a887e2d781e2ab99ddd477bea256db2bd7c221cb06e141e22f0cfc598a +size 8986838 diff --git a/global_step2693/layer_33-model_states.pt b/global_step2693/layer_33-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8764e4b90a6d5c3029a628a7057baa11c8a6203 --- /dev/null +++ b/global_step2693/layer_33-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac70f4353609c40d9cef391adcd322b73b5f0b4503a8ff93bcddcb4687273d01 +size 8986838 diff --git a/global_step2693/layer_34-model_states.pt b/global_step2693/layer_34-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9805ba68afe687796cadd329bfb9aa0c4dbbb2be --- /dev/null +++ b/global_step2693/layer_34-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f2deaeb9c3265f467fe80a88ff058cf8b8f004ab41edc4e7e8ce3d89188c985 +size 8986838 diff --git a/global_step2693/layer_35-model_states.pt b/global_step2693/layer_35-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d51dc57ef9f8eac068c4a40ca1432d6989c295e8 --- /dev/null +++ b/global_step2693/layer_35-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b386c3c3d25818cf70ba70395a47a6c59ad0b1ae0515fec11dc46eb9f2ba1e +size 8986838 diff --git a/global_step2693/layer_36-model_states.pt b/global_step2693/layer_36-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..890221afd370c2cdeade451124af72d9b794807e --- /dev/null +++ b/global_step2693/layer_36-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0643d8cdf27453bdfcba39ee2cd2447306f580979e46bb86f65202bd072917e +size 8986838 diff --git a/global_step2693/layer_37-model_states.pt b/global_step2693/layer_37-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7481e37e190ea7bec2ae07d3fca55a9f84bf8fd --- /dev/null +++ b/global_step2693/layer_37-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18b599bee70dd85bafe84a8eb946924736d5534dbbf8b55eeb74ddb38fdccb6 +size 8986838 diff --git a/global_step2693/layer_38-model_states.pt b/global_step2693/layer_38-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32b2cdf8ab12ca355a64f50a204677541409e9b0 --- /dev/null +++ b/global_step2693/layer_38-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1da363bf373eb1ec3409a36ef6b3840e10a4892a4640881c01d052ea466a0f4 +size 8986838 diff --git a/global_step2693/layer_39-model_states.pt b/global_step2693/layer_39-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b13a4fb1698f2b08c232cb2fa872053ba964d59b --- /dev/null +++ b/global_step2693/layer_39-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe6d3213daee88fed0f3d5b4c2af7a88fd4ca79648fa15d1b337b3e1ec19b31 +size 8986838 diff --git a/global_step2693/layer_40-model_states.pt b/global_step2693/layer_40-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10311e257a4f4561bc7f8de545c0c3de2d2bf836 --- /dev/null +++ b/global_step2693/layer_40-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3acd69268022c8f7597b8d0ceb6ad089bdf4180e465fb68451468db5b9cffcc8 +size 8986838 diff --git a/global_step2693/layer_41-model_states.pt b/global_step2693/layer_41-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd39c77c38740b902e047900c7b8d0060c5dab2 --- /dev/null +++ b/global_step2693/layer_41-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7cd44b9d5e66a50fda4fcfc9b19b410cfcb8f8bb3e5334cbb02f47228905913 +size 920 diff --git a/global_step2693/mp_rank_00_model_states.pt b/global_step2693/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfd370e1976bad4cfa79a70e480eda829012a299 --- /dev/null +++ b/global_step2693/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480e7025704e3d268429f29b580bbb8249a14712731d4c5f2b1e2a6b19efc47f +size 1078301785 diff --git a/global_step3366/layer_00-model_states.pt b/global_step3366/layer_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42aca0cbb86108d21d40999b36c31f9b86048523 --- /dev/null +++ b/global_step3366/layer_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5 +size 920 diff --git a/global_step3366/layer_01-model_states.pt b/global_step3366/layer_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adae15b73edab5932ad0611cbc6075770ba28b02 --- /dev/null +++ b/global_step3366/layer_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138dab85c8974124bb9325fed3760bd18cd9feaf5b21a173c9035d44e64f475c +size 8986838 diff --git a/global_step3366/layer_02-model_states.pt b/global_step3366/layer_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cf9b0fc980c6f40d8d1858457f16932cb61905d --- /dev/null +++ b/global_step3366/layer_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:172fc7b45ea1fef01ddd3ec8d35cbe768ec0fe51837580955e3fcd6f53c03e65 +size 8986838 diff --git a/global_step3366/layer_03-model_states.pt b/global_step3366/layer_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4059d5f96337b79e1c3b6f164f377d81c2b03c35 --- /dev/null +++ b/global_step3366/layer_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e5c8e09759cc316f28c54e518f279bcae597449cc3ce2a321b57035911b4c02 +size 8986838 diff --git a/global_step3366/layer_04-model_states.pt b/global_step3366/layer_04-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7af441b18dcbe4095cb39dab8a54e9d253f7d1fd --- /dev/null +++ b/global_step3366/layer_04-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46b6d1735758c795d995df2c486485c2dda306e6218c92db5b61f23b572781a7 +size 8986838 diff --git a/global_step3366/layer_05-model_states.pt b/global_step3366/layer_05-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe764eb1a3eff7cd58c663f37252c9ca0b18b69d --- /dev/null +++ b/global_step3366/layer_05-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82417e61da8f84d5a833a920dc2c4705bb3dcb2dcf820dce69464fc381962dda +size 8986838 diff --git a/global_step3366/layer_06-model_states.pt b/global_step3366/layer_06-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1579e7efa7e7ffa384248962a4aa4d7d8d2056a8 --- /dev/null +++ b/global_step3366/layer_06-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97ac7afa162e88933d53a6cdfe8c911290dfef29f04c8e294d2b609074a60b2 +size 8986838 diff --git a/global_step3366/layer_07-model_states.pt b/global_step3366/layer_07-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e821bda69e295531bace17576151433fd244eec --- /dev/null +++ b/global_step3366/layer_07-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d349d7807c4a0193d76c8654f960ce91a69c09f508af095de180435890f4dca +size 8986838 diff --git a/global_step3366/layer_08-model_states.pt b/global_step3366/layer_08-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c61e8b40cb27914b1745c75bbb11675548c6546 --- /dev/null +++ b/global_step3366/layer_08-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e87766684fda3b9d0022c363a8f2b6647ad739614a07f7b4525f38214baba9 +size 8986838 diff --git a/global_step3366/layer_09-model_states.pt b/global_step3366/layer_09-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2af2a46fdee348cdc808c501f45823764202e6a9 --- /dev/null +++ b/global_step3366/layer_09-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd0d361847a662badc4b21b3f079b518406360fa00011a8ae3d31a727f33a6d4 +size 8986838 diff --git a/global_step3366/layer_10-model_states.pt b/global_step3366/layer_10-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf1f05bc51ac907829d16cfff04afdbc24d42cb7 --- /dev/null +++ b/global_step3366/layer_10-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d360e6a79957cfb81d102b98e658a41a345e03241cbd4a08757497cc9e9925 +size 8986838 diff --git a/global_step3366/layer_11-model_states.pt b/global_step3366/layer_11-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9e4a11d25d719cde59ee07948f30cd41104d0f7 --- /dev/null +++ b/global_step3366/layer_11-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83c5f245596c68b553c98eac0ff03a0d015389c6a8c25432ebe36a34f1f01893 +size 8986838 diff --git a/global_step3366/layer_12-model_states.pt b/global_step3366/layer_12-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..033c8fec05b77ddc58b10ed49e12ed718ac4d3b0 --- /dev/null +++ b/global_step3366/layer_12-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06fd2d532ab9de25b237331805326ab124ba4a7d60cdd48c30d90abbb58d7310 +size 8986838 diff --git a/global_step3366/layer_13-model_states.pt b/global_step3366/layer_13-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..abec0e9a3702bd26322175032f7a257907015945 --- /dev/null +++ b/global_step3366/layer_13-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb2601416fb8f0bd8ba05fbdd7d44645e8b0f5ccd0def98a7cb2ee7cedcc754 +size 8986838 diff --git a/global_step3366/layer_14-model_states.pt b/global_step3366/layer_14-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dc47f4531df11dd3c04a82bdcb2646d52930a73 --- /dev/null +++ b/global_step3366/layer_14-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a786f3e85ba167f6e89deb7741bbee3efcf15d6968279dcbb492eaca671def +size 8986838 diff --git a/global_step3366/layer_15-model_states.pt b/global_step3366/layer_15-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c14e64109f41195aaa3663e5ee106240ea26743d --- /dev/null +++ b/global_step3366/layer_15-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d87462655a91754a1ebab18a7a4e2f851814429a43618a98072cdce88995c04 +size 8986838 diff --git a/global_step3366/layer_16-model_states.pt b/global_step3366/layer_16-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f3cdba410f6cedd6ffe409c145561400ac73276 --- /dev/null +++ b/global_step3366/layer_16-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816b097f4cb299fdd39db23441a40462375ee2d846853867624d29fc7e660494 +size 8986838 diff --git a/global_step3366/layer_17-model_states.pt b/global_step3366/layer_17-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ab6fb33cfe6893bf4e20e9d3d50bc884cccc80d --- /dev/null +++ b/global_step3366/layer_17-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8ecf7e682ade2269c7f259f8b6967e1bd949f4c003311a89b3420ff48ac8779 +size 8986838 diff --git a/global_step3366/layer_18-model_states.pt b/global_step3366/layer_18-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83404e500a2af9b60ea64e59fcbeb73d2189fe58 --- /dev/null +++ b/global_step3366/layer_18-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78cb5504b2964888f875fa63d902946919811ac160c95e18d16ae21e2928f849 +size 8986838 diff --git a/global_step3366/layer_19-model_states.pt b/global_step3366/layer_19-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ada87941d2aa87c18b8864c3d36f6340371a6b19 --- /dev/null +++ b/global_step3366/layer_19-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30b88d819ff08e094dbda1ea5e13c25257c74cf1f1d9604f4ad1249a78f2b25 +size 8986838 diff --git a/global_step3366/layer_20-model_states.pt b/global_step3366/layer_20-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db85d9f639dd057f35273fe12ba0db73409e7204 --- /dev/null +++ b/global_step3366/layer_20-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfbac8821dcccdf381d293d92e83bf31b20866d83bf1eb7d62bf7553e261bd3f +size 8986838 diff --git a/global_step3366/layer_21-model_states.pt b/global_step3366/layer_21-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..723f09b070b28e0faa708123505f2812f03d4053 --- /dev/null +++ b/global_step3366/layer_21-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a62d5788c54f5c315680ce72f6ab15ef97b6d65ec5729b089b610042bd86b34d +size 8986838 diff --git a/global_step3366/layer_22-model_states.pt b/global_step3366/layer_22-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62075c0dba7f7aab42ca5a3ef140618df86b11c4 --- /dev/null +++ b/global_step3366/layer_22-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6399d05daaffbc5e505fa55d6d018401f3c47b234d8f88250bbfe2225574805 +size 8986838 diff --git a/global_step3366/layer_23-model_states.pt b/global_step3366/layer_23-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac7ee426579a0ecc935e9451a6601aa9be346ef5 --- /dev/null +++ b/global_step3366/layer_23-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5085b14178acaaf1398500c5e7c681db05df00e987b54e1ebd40ce2627482db +size 8986838 diff --git a/global_step3366/layer_24-model_states.pt b/global_step3366/layer_24-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc32ad5b7617c5eebc817e1b1eb2771b945eaf9f --- /dev/null +++ b/global_step3366/layer_24-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef53e9ab06a0d55e821ba2eb8c8326ea1d10c81c109c7d2f53f62afa9e8368a +size 8986838 diff --git a/global_step3366/layer_25-model_states.pt b/global_step3366/layer_25-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6beda806814b815cb6dccb1b726c665e8e55a75 --- /dev/null +++ b/global_step3366/layer_25-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c0519ff6db256f8f712d051de32d1bd95e6a89eef0a7e15664dc1ccc8715408 +size 8986838 diff --git a/global_step3366/layer_26-model_states.pt b/global_step3366/layer_26-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d755d8f76a92f345a47eefece97127748e073a4 --- /dev/null +++ b/global_step3366/layer_26-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d2d11cd48bc30afaef158e465911f11915ba06b4da7a5173367573bbca9506 +size 8986838 diff --git a/global_step3366/layer_27-model_states.pt b/global_step3366/layer_27-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..237aa882b8095bd205050fa89e902293b3481176 --- /dev/null +++ b/global_step3366/layer_27-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:464a5e0238cdb96f6bf00b9eb5983b597620504ca93e687db007d4c53ed7c986 +size 8986838 diff --git a/global_step3366/layer_28-model_states.pt b/global_step3366/layer_28-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edb29fa9343da7c84fff6ab5e896040db9a9b0e3 --- /dev/null +++ b/global_step3366/layer_28-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8173af6caa72b82ebb9a4c365f520e4baef773754eb424826f9f14627deb6575 +size 8986838 diff --git a/global_step3366/layer_29-model_states.pt b/global_step3366/layer_29-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f609d33dcf3fc6199871fbb86e1af34773990544 --- /dev/null +++ b/global_step3366/layer_29-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa97efe985fe51233935a8298c4d8e71f3e38f7f3f8743d609275fdb87f3cbb0 +size 8986838 diff --git a/global_step3366/layer_30-model_states.pt b/global_step3366/layer_30-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..201f17a62d44c2e189b4c5fe7f75a6e0187aa064 --- /dev/null +++ b/global_step3366/layer_30-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6651db37eb68b4c9420bc44675a95ed32281e1bc5f2ea404fc8cf2a3a46f76 +size 8986838 diff --git a/global_step3366/layer_31-model_states.pt b/global_step3366/layer_31-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f46b4b1f9c6746b472230e9ad2f87d20795bd677 --- /dev/null +++ b/global_step3366/layer_31-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b23f7ca5f6e084edb2dea153c289c9406bc52bc1f7fff5beb57b8190543da78 +size 8986838 diff --git a/global_step3366/layer_32-model_states.pt b/global_step3366/layer_32-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01a2108f5943bb849f8cd636e306ce2d00b5d6a --- /dev/null +++ b/global_step3366/layer_32-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f4e5b76ae14a770ef21d99a30d2594778a8c77118d54251e5ad7f680b19a359 +size 8986838 diff --git a/global_step3366/layer_33-model_states.pt b/global_step3366/layer_33-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c929eaa044ef0a1636a63eb45d3e29da9202a43 --- /dev/null +++ b/global_step3366/layer_33-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20a391b9a464d99bdc5cae6def8df7ea1e36ce8c08a59e3813de5a4b88600313 +size 8986838 diff --git a/global_step3366/layer_34-model_states.pt b/global_step3366/layer_34-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a14805efd0db8bed8983deb2887b39cac1f1225 --- /dev/null +++ b/global_step3366/layer_34-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fdb633233676704e745f2bbb821d47d4dd8d3e35627ec5b2ae8c912dcf6083f +size 8986838 diff --git a/global_step3366/layer_35-model_states.pt b/global_step3366/layer_35-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e339a3f5951dcd3405697764c97203258f840d88 --- /dev/null +++ b/global_step3366/layer_35-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8696f9aa6a3cb68e19ff74f9267c3947560a14c2eb9d05d64907a64dcfc5f8de +size 8986838 diff --git a/global_step3366/layer_36-model_states.pt b/global_step3366/layer_36-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fd00fa4679da3ad2f5230d3c62f86e9f2465aa6 --- /dev/null +++ b/global_step3366/layer_36-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcf1408802693777cad02ac650dfb4318712ab64464ba7a3574976a41bd8fdb7 +size 8986838 diff --git a/global_step3366/layer_37-model_states.pt b/global_step3366/layer_37-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c386a5d071dce7c69c1445f5bd3aa185d131a93 --- /dev/null +++ b/global_step3366/layer_37-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f5ba183215387e6d6476e5204b0689ff8f8aba33e70e0d7fa4597cbfaf60f47 +size 8986838 diff --git a/global_step3366/layer_38-model_states.pt b/global_step3366/layer_38-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a641e8ae8b00969783252e6917e98f04839a367 --- /dev/null +++ b/global_step3366/layer_38-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8379624c6be39390d9d4491e5d6821e8d2909a77e316d5906aef58ec55cf4672 +size 8986838 diff --git a/global_step3366/layer_39-model_states.pt b/global_step3366/layer_39-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c071153d280af7b9d80d8222f9e0c096fe06b176 --- /dev/null +++ b/global_step3366/layer_39-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7087e5f986fec8548d18e7a5e043a28b447ac52e9d203fef1dbcc822f0f502e +size 8986838 diff --git a/global_step3366/layer_40-model_states.pt b/global_step3366/layer_40-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc14e61acd55f793b74d22c8985d9969e7b25c74 --- /dev/null +++ b/global_step3366/layer_40-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a060474bbf5728e5ff2d647229e9a71c2510390f5f1d1581f42fb90ba8e99e0 +size 8986838 diff --git a/global_step3366/layer_41-model_states.pt b/global_step3366/layer_41-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd39c77c38740b902e047900c7b8d0060c5dab2 --- /dev/null +++ b/global_step3366/layer_41-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7cd44b9d5e66a50fda4fcfc9b19b410cfcb8f8bb3e5334cbb02f47228905913 +size 920 diff --git a/global_step3366/mp_rank_00_model_states.pt b/global_step3366/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..456403d7eb8e799b9b661ddc243888fe2d9d2333 --- /dev/null +++ b/global_step3366/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f477fc29631ae95b7724ce294ee86a6c429a850a4c4c87094408ecea8d4da468 +size 1078301785 diff --git a/global_step674/layer_00-model_states.pt b/global_step674/layer_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42aca0cbb86108d21d40999b36c31f9b86048523 --- /dev/null +++ b/global_step674/layer_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c682c9cc8e731c37498845f3a980635b04094468e55e77553c7a12ccff998f5 +size 920 diff --git a/global_step674/layer_01-model_states.pt b/global_step674/layer_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7e0f59b5ddcc607e56d1621f41ca670a2bf99a9 --- /dev/null +++ b/global_step674/layer_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84286080d69a535a39b766429ab1849ae6c9b2f5af35fa072cb554f7a21c2de9 +size 8986838 diff --git a/global_step674/layer_02-model_states.pt b/global_step674/layer_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69cc5c8f2876d966b3df9a1562d65b49f7a42088 --- /dev/null +++ b/global_step674/layer_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2aab8bcf98c0c712f4b7ca1f6b30c32aecc3ad500f66a80f3437f94d466484c +size 8986838 diff --git a/global_step674/layer_03-model_states.pt b/global_step674/layer_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72bd5ade0090052640715b8fb447a5cc848d53d5 --- /dev/null +++ b/global_step674/layer_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df49d9430c8f7ca27a364996320ef680a53d8d18aae66ee93094acb698266c2 +size 8986838 diff --git a/global_step674/layer_04-model_states.pt b/global_step674/layer_04-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b656dc423da9eff2ac9cf3f3c1b5d484101dd025 --- /dev/null +++ b/global_step674/layer_04-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f03d0765af77a51edb7bbb3d3d9f790a258c1dbaf13bd41506677276cb7e9a8 +size 8986838 diff --git a/global_step674/layer_05-model_states.pt b/global_step674/layer_05-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1476e24f2d872838d657202fdc9c717d29d38f6d --- /dev/null +++ b/global_step674/layer_05-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceba3c876e5bb8fd788747081b09dbbcd76d70b0655f05dc0d452369b460ec90 +size 8986838 diff --git a/global_step674/layer_06-model_states.pt b/global_step674/layer_06-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6b7ab8045b2455ad450f9cba9fb44066e2198f3 --- /dev/null +++ b/global_step674/layer_06-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df92ba23834efa5c34afe2e6be4635fc4cd8681bab1c91617622fb45d9429c8 +size 8986838 diff --git a/global_step674/layer_07-model_states.pt b/global_step674/layer_07-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f28bf5b33d526c5d6df9093bdf94cda9fc4a846 --- /dev/null +++ b/global_step674/layer_07-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86917d2c46f65fd311e8024510fe1128d97728eb62eda39cb72a7a556d9c9a10 +size 8986838 diff --git a/global_step674/layer_08-model_states.pt b/global_step674/layer_08-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7054cbb8b98808cb90d214122593ec48d4bed91 --- /dev/null +++ b/global_step674/layer_08-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df846ff6545e60926293aa70f4bab174b28034209b3536c7f6dbd5b1f7f4d3ee +size 8986838 diff --git a/global_step674/layer_09-model_states.pt b/global_step674/layer_09-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b747408405671055e034ff7af51b705e83617a8 --- /dev/null +++ b/global_step674/layer_09-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f5866ef22df2ead0d9d547893bb9ff486619be68209ab82abbf5e03b93b35a +size 8986838 diff --git a/global_step674/layer_10-model_states.pt b/global_step674/layer_10-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5441636f19386c99312719fb6ba4de628e6b643e --- /dev/null +++ b/global_step674/layer_10-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cdfdc5cccd939647ac74fe23310fb9244484df31237a06170173dee8a02e4e9 +size 8986838 diff --git a/global_step674/layer_11-model_states.pt b/global_step674/layer_11-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82095c946dea2739dba397007fe80885f55b0c09 --- /dev/null +++ b/global_step674/layer_11-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ca20e8d6349122e918985c52c77d13c9091084564610a511135ab3b4ded33e +size 8986838 diff --git a/global_step674/layer_12-model_states.pt b/global_step674/layer_12-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db49d9c9f72f9944ba9553654dbff693f7cd6896 --- /dev/null +++ b/global_step674/layer_12-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:328bd5a6f3e22a5dce7ed1d7bfed14504a33d9542320e2c48d856f2b43694bfd +size 8986838 diff --git a/global_step674/layer_13-model_states.pt b/global_step674/layer_13-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2058908a038f91e810e7646b3b47a5cdcc648410 --- /dev/null +++ b/global_step674/layer_13-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce85aba9463fcc133c2fd5344564f42ba68d5c417828d81f9d0ecdaaae7f11fb +size 8986838 diff --git a/global_step674/layer_14-model_states.pt b/global_step674/layer_14-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..358f987d265b805cb9302299767ba6438bfa3d97 --- /dev/null +++ b/global_step674/layer_14-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3881660dcdf0cfbf9aeab44c75cba46db0a967eade5354bda02734575321c0a8 +size 8986838 diff --git a/global_step674/layer_15-model_states.pt b/global_step674/layer_15-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f0612515247d699528a75f59bf604d1cbc215f2 --- /dev/null +++ b/global_step674/layer_15-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eb2e1154592b776552d4b92c4c2834d0be36bcaa5a206f72e64c817b875479e +size 8986838 diff --git a/global_step674/layer_16-model_states.pt b/global_step674/layer_16-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0550824d922887d084c1f06159994806945086d0 --- /dev/null +++ b/global_step674/layer_16-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c939c3cac98fecdf76dddd5967eac8009b7e5b1187a61b2c008847fd1552c66 +size 8986838 diff --git a/global_step674/layer_17-model_states.pt b/global_step674/layer_17-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92d9d0eb2d76018df984b4883929e771d572c500 --- /dev/null +++ b/global_step674/layer_17-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb24eb390d7fc0f719482ab620bfde53d32eda4fe9b05181bc859d9b6b674840 +size 8986838 diff --git a/global_step674/layer_18-model_states.pt b/global_step674/layer_18-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39783adfb034d2c596506df74ca4933656d45cb1 --- /dev/null +++ b/global_step674/layer_18-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d8ddda98f67753305ad5c0a7318ee1cb5c005f2c6b9f372c5a0d799054ed51 +size 8986838 diff --git a/global_step674/layer_19-model_states.pt b/global_step674/layer_19-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..923fbc96a7bdb102e10a2d980a73280179d740ca --- /dev/null +++ b/global_step674/layer_19-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8728959df21091533e85c32dd016c2136a4e06abeeb404c0703e493b6d53fb33 +size 8986838 diff --git a/global_step674/layer_20-model_states.pt b/global_step674/layer_20-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26ae23c95e2b8624c30c4c946cfd9f9fc45e60b0 --- /dev/null +++ b/global_step674/layer_20-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06a056b73cdaed0105f57c51ba3a628beb363d48d1ad97f665b3227c9d72d6f +size 8986838 diff --git a/global_step674/layer_21-model_states.pt b/global_step674/layer_21-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc8b713eee6a4e68097910be4e144a64fdf226a8 --- /dev/null +++ b/global_step674/layer_21-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1b19c9bc606c6755227c2e889f1144b20a6a085eda9b3a845dc2c6bbe4a2a62 +size 8986838 diff --git a/global_step674/layer_22-model_states.pt b/global_step674/layer_22-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0351e9f6473f4864e5227abdda51950a7e486d52 --- /dev/null +++ b/global_step674/layer_22-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fa8952f4256cf0a5ca4ea1fa835302e7c51008bd9321e0bd9159a5dd86f499e +size 8986838 diff --git a/global_step674/layer_23-model_states.pt b/global_step674/layer_23-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88631384d5a22419900f22229cc9fbb7a110a00c --- /dev/null +++ b/global_step674/layer_23-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0565f7e5d473a8e00d772d0afb1ea1fa2d9a59bceec3d4e5ec12b682bd0c1eb +size 8986838 diff --git a/global_step674/layer_24-model_states.pt b/global_step674/layer_24-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2c4ebf74d3f96c8f1f1ef38b97256195dabb0dd --- /dev/null +++ b/global_step674/layer_24-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37afec3691d7910b5eae3ac9637a17b4ebb1da93ade52d0d969233f41638b2c2 +size 8986838 diff --git a/global_step674/layer_25-model_states.pt b/global_step674/layer_25-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9702c2acfd21d4b126aa19f71674424eb571b514 --- /dev/null +++ b/global_step674/layer_25-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b7d0e2a8929ffd02b7d52d24ab88705ce96900fc74f7e063ef83f7ba5286a29 +size 8986838 diff --git a/global_step674/layer_26-model_states.pt b/global_step674/layer_26-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..575555fef3fbf1e2ab01fc5061fde04eed63cd24 --- /dev/null +++ b/global_step674/layer_26-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d00f420747b02a12a4afb067779d010b8bc75db7e9230985f4ad78080467225 +size 8986838 diff --git a/global_step674/layer_27-model_states.pt b/global_step674/layer_27-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21f9dc76893f6e4076c01ff298868b41512c2b42 --- /dev/null +++ b/global_step674/layer_27-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b8f8b2076ed49e9ac5b3760423a964b334b06cc918e2b2b25fef992932e511 +size 8986838 diff --git a/global_step674/layer_28-model_states.pt b/global_step674/layer_28-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b53410185efdc81064ff3892eba7e7b059ee9090 --- /dev/null +++ b/global_step674/layer_28-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfbc2e5e137cf577e40727bb4b954012a465862c25c186d43a4a22f7b6b230c9 +size 8986838 diff --git a/global_step674/layer_29-model_states.pt b/global_step674/layer_29-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e833650af00097cd8aba1f7cb91b60bb48ca7058 --- /dev/null +++ b/global_step674/layer_29-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb00ce5547e390d7bf8526754d2b6c4be521af01952d833f502bb595bd076df +size 8986838 diff --git a/global_step674/layer_30-model_states.pt b/global_step674/layer_30-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1854c5f5acc6142713a055f2a3f7ebb2d3fa985 --- /dev/null +++ b/global_step674/layer_30-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18ca5ffa217daee5a521c7747447ab7b4ae73384e2d274d8d3f0d15a8b16917d +size 8986838 diff --git a/global_step674/layer_31-model_states.pt b/global_step674/layer_31-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86b3c236b2cb1e0c7cd3d3d7c62a8c3d96803e8f --- /dev/null +++ b/global_step674/layer_31-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479bc2f7b53f94efb57264d1998ca691e7b63785dab335fce7c2c1bc06039656 +size 8986838 diff --git a/global_step674/layer_32-model_states.pt b/global_step674/layer_32-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f53334c016b8673b09e51bcd1dc46ec533689a5 --- /dev/null +++ b/global_step674/layer_32-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8fd868a11303b7bbd909f5069ca61643d9da74f7908eaaa2cf1310e65430823 +size 8986838 diff --git a/global_step674/layer_33-model_states.pt b/global_step674/layer_33-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a20de3c6673d6505d7eb32d1e39b86536b8ccdaf --- /dev/null +++ b/global_step674/layer_33-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605f2f9298cd651972f01711fc281ed8c36f95562a2adc0d0f91ad626aa02cea +size 8986838 diff --git a/global_step674/layer_34-model_states.pt b/global_step674/layer_34-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02533b98c122a0091af293429d24fb20eeb7eace --- /dev/null +++ b/global_step674/layer_34-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ac784d04166c2ec2c63bf1a13e1172ee2b7509e275c828018be61af0b346e3b +size 8986838 diff --git a/global_step674/layer_35-model_states.pt b/global_step674/layer_35-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33ddc628d00152b4ca8b636084d974ae601d37a6 --- /dev/null +++ b/global_step674/layer_35-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9484dfdfb4cae79593fb2a517f312fb674b9fcae7ce10414473dcd4cf11703b4 +size 8986838 diff --git a/global_step674/layer_36-model_states.pt b/global_step674/layer_36-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..003f98ab707e24d71c4d589f65d2deea7c7b5869 --- /dev/null +++ b/global_step674/layer_36-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e839b352e8965e6432f07891304639693452eea36689d70cd3b3d69a709e1fac +size 8986838 diff --git a/global_step674/layer_37-model_states.pt b/global_step674/layer_37-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82a8f1d8ee11e81bad513f91266bf675ed4ef3d6 --- /dev/null +++ b/global_step674/layer_37-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:539586ee06f0eb3ad3176c534d867b65d20f7d0ed9b6a9de05ec020c9bec0569 +size 8986838 diff --git a/global_step674/layer_38-model_states.pt b/global_step674/layer_38-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a66a82c464e4165551735effd41fe778339a2025 --- /dev/null +++ b/global_step674/layer_38-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3156985821b4db9f802c99329ada351d58732267b8a01a6dd6673bc982d0bd +size 8986838 diff --git a/global_step674/layer_39-model_states.pt b/global_step674/layer_39-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1591274a438458fb29272e1e9544c65b278c8ffd --- /dev/null +++ b/global_step674/layer_39-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:600964ba8de62bdaa99d76705faaeea4bfd70c051d5749b573c08f678e0fa3f1 +size 8986838 diff --git a/global_step674/layer_40-model_states.pt b/global_step674/layer_40-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3a8a1ab0592aa84c9b5d547e7fc402283f0d721 --- /dev/null +++ b/global_step674/layer_40-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc4a882ff5e147570bbec0d21f86a853dd65954713b414ae6e55b0d6dd36af65 +size 8986838 diff --git a/global_step674/layer_41-model_states.pt b/global_step674/layer_41-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd39c77c38740b902e047900c7b8d0060c5dab2 --- /dev/null +++ b/global_step674/layer_41-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7cd44b9d5e66a50fda4fcfc9b19b410cfcb8f8bb3e5334cbb02f47228905913 +size 920 diff --git a/global_step674/mp_rank_00_model_states.pt b/global_step674/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b9705bd2cf2065ee44cdaff96c621fec85a0129 --- /dev/null +++ b/global_step674/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02f814e8c94c11a3d47dac60c19d9244d7c9e9019a8c4e7c43bbb39be14088fc +size 1078301785 diff --git a/latest b/latest new file mode 100644 index 0000000000000000000000000000000000000000..9390b9fafd2c25d33bf50e50e86e491617bd18e6 --- /dev/null +++ b/latest @@ -0,0 +1 @@ +global_step3366 \ No newline at end of file diff --git a/wan.toml b/wan.toml new file mode 100644 index 0000000000000000000000000000000000000000..1f2afb3451b60505221f302beb422f2db801526d --- /dev/null +++ b/wan.toml @@ -0,0 +1,48 @@ +output_dir = "/workspace/ComfyUI/models/loras/out" +dataset = "/workspace/configs/dataset_wan.toml" +epochs = 1000 +micro_batch_size_per_gpu = 1 +pipeline_stages = 1 +gradient_accumulation_steps = 1 +gradient_clipping = 1.0 +warmup_steps = 40 +activation_checkpointing = true +partition_method = "parameters" +save_dtype = "bfloat16" +caching_batch_size = 1 +steps_per_print = 1 +video_clip_mode = "single_beginning" +save_every_n_epochs = 10 +checkpoint_every_n_minutes = 120 +blocks_to_swap = 20 + +eval_every_n_epochs = 1 +eval_before_first_step = true +eval_micro_batch_size_per_gpu = 1 +eval_gradient_accumulation_steps = 1 + +[model] +type = "wan" +ckpt_path = "/workspace/Wan2.1" +transformer_path = '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors' +llm_path = '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors' +dtype = "bfloat16" +timestep_sample_method = "logit_normal" + +[adapter] +type = "lora" +rank = 32 +dtype = "bfloat16" + +[optimizer] +type = "adamw_optimi" +lr = 1e-5 +betas = [ 0.9, 0.99,] +weight_decay = 0.01 + +[monitoring] +# Set to true and fill in these fields to enable wandb +enable_wandb = true +wandb_api_key = 'f46df1bb828b735bd22f94fff1be190ba5e046f9' +wandb_tracker_name = 'wan-lora' +wandb_run_name = 'wan-lora' diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..203156c8edb0f901705f25fa9530914c25d8348a --- /dev/null +++ b/wandb/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-05-05T19:10:35.753351658Z","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/workspace/ComfyUI/models/loras/out/20250505_19-10-35/wandb/run-20250505_191035-lg5j0rns/logs/debug-core.log"} +{"time":"2025-05-05T19:10:35.966072548Z","level":"INFO","msg":"created new stream","id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966148127Z","level":"INFO","msg":"stream: started","id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966212213Z","level":"INFO","msg":"writer: Do: started","stream_id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966242016Z","level":"INFO","msg":"handler: started","stream_id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966521812Z","level":"INFO","msg":"sender: started","stream_id":"lg5j0rns"} +{"time":"2025-05-05T19:10:36.117361836Z","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/debug.log b/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2f39708c0bfaede2fc1a8259b7811f14ff9447b3 --- /dev/null +++ b/wandb/debug.log @@ -0,0 +1,22 @@ +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10 +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Configure stats pid to 20093 +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Loading settings from /root/.config/wandb/settings +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Loading settings from /workspace/diffusion-pipe/wandb/settings +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Loading settings from environment variables +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /workspace/ComfyUI/models/loras/out/20250505_19-10-35/wandb/run-20250505_191035-lg5j0rns/logs/debug.log +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /workspace/ComfyUI/models/loras/out/20250505_19-10-35/wandb/run-20250505_191035-lg5j0rns/logs/debug-internal.log +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():852] calling init triggers +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'output_dir': '/workspace/ComfyUI/models/loras/out', 'dataset': '/workspace/configs/dataset_wan.toml', 'epochs': 1000, 'micro_batch_size_per_gpu': 1, 'pipeline_stages': 1, 'gradient_accumulation_steps': 1, 'gradient_clipping': 1.0, 'warmup_steps': 40, 'activation_checkpointing': True, 'partition_method': 'parameters', 'save_dtype': torch.bfloat16, 'caching_batch_size': 1, 'steps_per_print': 1, 'video_clip_mode': 'single_beginning', 'save_every_n_epochs': 10, 'checkpoint_every_n_minutes': 120, 'blocks_to_swap': 20, 'eval_every_n_epochs': 1, 'eval_before_first_step': True, 'eval_micro_batch_size_per_gpu': 1, 'eval_gradient_accumulation_steps': 1, 'model': {'type': 'wan', 'ckpt_path': '/workspace/Wan2.1', 'transformer_path': '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors', 'llm_path': '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors', 'dtype': torch.bfloat16, 'timestep_sample_method': 'logit_normal', 'guidance': 1.0}, 'adapter': {'type': 'lora', 'rank': 32, 'dtype': torch.bfloat16, 'alpha': 32, 'dropout': 0.0}, 'optimizer': {'type': 'adamw_optimi', 'lr': 1e-05, 'betas': [0.9, 0.99], 'weight_decay': 0.01}, 'monitoring': {'enable_wandb': True, 'wandb_api_key': 'f46df1bb828b735bd22f94fff1be190ba5e046f9', 'wandb_tracker_name': 'wan-lora', 'wandb_run_name': 'wan-lora'}, 'reentrant_activation_checkpointing': False, 'logging_steps': 1, 'eval_datasets': [], 'eval_every_n_steps': None, '_wandb': {}} +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():893] starting backend +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():897] sending inform_init request +2025-05-05 19:10:35,748 INFO MainThread:20093 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-05 19:10:35,749 INFO MainThread:20093 [wandb_init.py:init():907] backend started and connected +2025-05-05 19:10:35,751 INFO MainThread:20093 [wandb_init.py:init():1002] updated telemetry +2025-05-05 19:10:35,759 INFO MainThread:20093 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout +2025-05-05 19:10:36,112 INFO MainThread:20093 [wandb_init.py:init():1101] starting run threads in backend +2025-05-05 19:10:36,318 INFO MainThread:20093 [wandb_run.py:_console_start():2566] atexit reg +2025-05-05 19:10:36,319 INFO MainThread:20093 [wandb_run.py:_redirect():2414] redirect: wrap_raw +2025-05-05 19:10:36,319 INFO MainThread:20093 [wandb_run.py:_redirect():2483] Wrapping output streams. +2025-05-05 19:10:36,320 INFO MainThread:20093 [wandb_run.py:_redirect():2506] Redirects installed. +2025-05-05 19:10:36,324 INFO MainThread:20093 [wandb_init.py:init():1147] run started, returning control to user process diff --git a/wandb/run-20250505_191035-lg5j0rns/files/output.log b/wandb/run-20250505_191035-lg5j0rns/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a19fe609b6d31e84fa890c183dc473d67b058e71 --- /dev/null +++ b/wandb/run-20250505_191035-lg5j0rns/files/output.log @@ -0,0 +1,8556 @@ +Block swap enabled. Swapping 20 blocks out of 40 blocks. +SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +Using topology: {ProcessCoord(pipe=0, data=0): 0} +[2025-05-05 19:11:11,965] [INFO] [module.py:398:_partition_layers] Partitioning pipeline stages with method parameters +stage=0 layers=42 + 0: InitialLayer + 1: TransformerLayer + 2: TransformerLayer + 3: TransformerLayer + 4: TransformerLayer + 5: TransformerLayer + 6: TransformerLayer + 7: TransformerLayer + 8: TransformerLayer + 9: TransformerLayer + 10: TransformerLayer + 11: TransformerLayer + 12: TransformerLayer + 13: TransformerLayer + 14: TransformerLayer + 15: TransformerLayer + 16: TransformerLayer + 17: TransformerLayer + 18: TransformerLayer + 19: TransformerLayer + 20: TransformerLayer + 21: TransformerLayer + 22: TransformerLayer + 23: TransformerLayer + 24: TransformerLayer + 25: TransformerLayer + 26: TransformerLayer + 27: TransformerLayer + 28: TransformerLayer + 29: TransformerLayer + 30: TransformerLayer + 31: TransformerLayer + 32: TransformerLayer + 33: TransformerLayer + 34: TransformerLayer + 35: TransformerLayer + 36: TransformerLayer + 37: TransformerLayer + 38: TransformerLayer + 39: TransformerLayer + 40: TransformerLayer + 41: FinalLayer + loss: loss_fn +[2025-05-05 19:11:12,034] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed info: version=0.16.7, git-hash=unknown, git-branch=unknown +[2025-05-05 19:11:12,034] [INFO] [config.py:735:__init__] Config mesh_device None world_size = 1 +[2025-05-05 19:11:12,209] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-05-05 19:11:12,214] [INFO] [logging.py:107:log_dist] [Rank 0] Using client callable to create basic optimizer +[2025-05-05 19:11:12,214] [INFO] [logging.py:107:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-05-05 19:11:12,404] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW +[2025-05-05 19:11:12,405] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW +[2025-05-05 19:11:12,405] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = None +[2025-05-05 19:11:12,405] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2025-05-05 19:11:12,405] [INFO] [logging.py:107:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[0.0] +[2025-05-05 19:11:12,421] [INFO] [config.py:1003:print] DeepSpeedEngine configuration: +[2025-05-05 19:11:12,421] [INFO] [config.py:1007:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'intra_op_parallelism': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] amp_enabled .................. False +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] amp_params ................... False +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] bfloat16_enabled ............. False +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] bfloat16_immediate_grad_update False +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] checkpoint_parallel_write_pipeline False +[2025-05-05 19:11:12,422] [INFO] [config.py:1007:print] checkpoint_tag_validation_enabled True +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] checkpoint_tag_validation_fail False +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] comms_config ................. +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] communication_data_type ...... None +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] compile_config ............... deepcompile=False free_activation=False offload_activation=False offload_opt_states=False double_buffer=True symmetric_memory=False debug_log=False offload_parameters=False sync_before_reduce=False sync_after_reduce=False sync_before_allgather=False sync_after_allgather=False +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] curriculum_enabled_legacy .... False +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] curriculum_params_legacy ..... False +[2025-05-05 19:11:12,423] [INFO] [config.py:1007:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'pin_memory': False, 'curriculum_learning': {'enabled': False}, 'dynamic_batching': {'enabled': False, 'lr_scaling_method': 'linear', 'min_batch_size': 1, 'max_batch_size': None, 'sequence_picking_order': 'dataloader', 'verbose': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] data_efficiency_enabled ...... False +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] dataloader_drop_last ......... False +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] disable_allgather ............ False +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] dump_state ................... False +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] dynamic_loss_scale_args ...... None +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] eigenvalue_enabled ........... False +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] eigenvalue_gas_boundary_resolution 1 +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] eigenvalue_layer_num ......... 0 +[2025-05-05 19:11:12,424] [INFO] [config.py:1007:print] eigenvalue_max_iter .......... 100 +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] eigenvalue_stability ......... 1e-06 +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] eigenvalue_tol ............... 0.01 +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] eigenvalue_verbose ........... False +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] elasticity_enabled ........... False +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] fp16_auto_cast ............... None +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] fp16_enabled ................. False +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] fp16_master_weights_and_gradients False +[2025-05-05 19:11:12,425] [INFO] [config.py:1007:print] global_rank .................. 0 +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] grad_accum_dtype ............. None +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] gradient_accumulation_steps .. 1 +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] gradient_clipping ............ 1.0 +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] gradient_predivide_factor .... 1.0 +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] graph_harvesting ............. False +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] initial_dynamic_scale ........ 65536 +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] load_universal_checkpoint .... False +[2025-05-05 19:11:12,426] [INFO] [config.py:1007:print] loss_scale ................... 0 +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] memory_breakdown ............. False +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] mics_hierarchial_params_gather False +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] mics_shard_size .............. -1 +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] optimizer_legacy_fusion ...... False +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] optimizer_name ............... None +[2025-05-05 19:11:12,427] [INFO] [config.py:1007:print] optimizer_params ............. None +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] pld_enabled .................. False +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] pld_params ................... False +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] prescale_gradients ........... False +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] scheduler_name ............... None +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] scheduler_params ............. None +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] seq_parallel_communication_data_type torch.float32 +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] sparse_attention ............. None +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] sparse_gradients_enabled ..... False +[2025-05-05 19:11:12,428] [INFO] [config.py:1007:print] steps_per_print .............. 1 +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tp_overlap_comm=False tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] timers_config ................ enabled=True synchronized=True +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] train_batch_size ............. 1 +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] train_micro_batch_size_per_gpu 1 +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] use_data_before_expert_parallel_ False +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] use_node_local_storage ....... False +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] wall_clock_breakdown ......... False +[2025-05-05 19:11:12,429] [INFO] [config.py:1007:print] weight_quantization_config ... None +[2025-05-05 19:11:12,430] [INFO] [config.py:1007:print] world_size ................... 1 +[2025-05-05 19:11:12,430] [INFO] [config.py:1007:print] zero_allow_untested_optimizer False +[2025-05-05 19:11:12,430] [INFO] [config.py:1007:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False +[2025-05-05 19:11:12,430] [INFO] [config.py:1007:print] zero_enabled ................. False +[2025-05-05 19:11:12,430] [INFO] [config.py:1007:print] zero_force_ds_cpu_optimizer .. True +[2025-05-05 19:11:12,430] [INFO] [config.py:1007:print] zero_optimization_stage ...... 0 +[2025-05-05 19:11:12,430] [INFO] [config.py:993:print_user_config] json = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "gradient_clipping": 1.0, + "steps_per_print": 1 +} +[2025-05-05 19:11:12,431] [INFO] [engine.py:105:__init__] CONFIG: micro_batches=1 micro_batch_size=1 +[2025-05-05 19:11:12,431] [INFO] [engine.py:146:__init__] is_pipe_partitioned= False is_grad_partitioned= False +[2025-05-05 19:11:12,476] [INFO] [engine.py:165:__init__] RANK=0 STAGE=0 LAYERS=42 [0, 42) STAGE_PARAMS=179568640 (179.569M) TOTAL_PARAMS=179568640 (179.569M) UNIQUE_PARAMS=179568640 (179.569M) +[2025-05-05 19:11:25,285] [INFO] [logging.py:107:log_dist] [Rank 0] step=1, skipped=0, lr=[4.937500000000001e-07], mom=[0.0] +steps: 1 loss: 0.2089 iter time (s): 12.296 samples/sec: 0.081 +[2025-05-05 19:11:35,955] [INFO] [logging.py:107:log_dist] [Rank 0] step=2, skipped=0, lr=[7.375000000000002e-07], mom=[0.0] +steps: 2 loss: 0.0643 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-05 19:11:46,621] [INFO] [logging.py:107:log_dist] [Rank 0] step=3, skipped=0, lr=[9.812500000000003e-07], mom=[0.0] +steps: 3 loss: 0.0596 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-05 19:11:57,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=4, skipped=0, lr=[1.2250000000000003e-06], mom=[0.0] +steps: 4 loss: 0.2443 iter time (s): 10.621 samples/sec: 0.094 +[2025-05-05 19:12:07,938] [INFO] [logging.py:107:log_dist] [Rank 0] step=5, skipped=0, lr=[1.4687500000000005e-06], mom=[0.0] +steps: 5 loss: 0.0723 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-05 19:12:18,606] [INFO] [logging.py:107:log_dist] [Rank 0] step=6, skipped=0, lr=[1.7125000000000005e-06], mom=[0.0] +steps: 6 loss: 0.0367 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:12:29,269] [INFO] [logging.py:107:log_dist] [Rank 0] step=7, skipped=0, lr=[1.956250000000001e-06], mom=[0.0] +steps: 7 loss: 0.0881 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-05 19:12:40,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=8, skipped=0, lr=[2.200000000000001e-06], mom=[0.0] +steps: 8 loss: 0.0546 iter time (s): 10.896 samples/sec: 0.092 +[2025-05-05 19:12:50,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=9, skipped=0, lr=[2.443750000000001e-06], mom=[0.0] +steps: 9 loss: 0.0444 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-05 19:13:01,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=10, skipped=0, lr=[2.6875000000000014e-06], mom=[0.0] +steps: 10 loss: 0.0355 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-05 19:13:12,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=11, skipped=0, lr=[2.9312500000000014e-06], mom=[0.0] +steps: 11 loss: 0.0597 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:13:22,865] [INFO] [logging.py:107:log_dist] [Rank 0] step=12, skipped=0, lr=[3.1750000000000014e-06], mom=[0.0] +steps: 12 loss: 0.0788 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:13:33,535] [INFO] [logging.py:107:log_dist] [Rank 0] step=13, skipped=0, lr=[3.4187500000000018e-06], mom=[0.0] +steps: 13 loss: 0.0471 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:13:44,200] [INFO] [logging.py:107:log_dist] [Rank 0] step=14, skipped=0, lr=[3.6625000000000018e-06], mom=[0.0] +steps: 14 loss: 0.0379 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 19:13:54,859] [INFO] [logging.py:107:log_dist] [Rank 0] step=15, skipped=0, lr=[3.906250000000002e-06], mom=[0.0] +steps: 15 loss: 0.0511 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-05 19:14:05,524] [INFO] [logging.py:107:log_dist] [Rank 0] step=16, skipped=0, lr=[4.150000000000002e-06], mom=[0.0] +steps: 16 loss: 0.0843 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-05 19:14:16,358] [INFO] [logging.py:107:log_dist] [Rank 0] step=17, skipped=0, lr=[4.393750000000002e-06], mom=[0.0] +steps: 17 loss: 0.0438 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 19:14:27,023] [INFO] [logging.py:107:log_dist] [Rank 0] step=18, skipped=0, lr=[4.637500000000002e-06], mom=[0.0] +steps: 18 loss: 0.1536 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 19:14:37,686] [INFO] [logging.py:107:log_dist] [Rank 0] step=19, skipped=0, lr=[4.881250000000002e-06], mom=[0.0] +steps: 19 loss: 0.1413 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-05 19:14:48,353] [INFO] [logging.py:107:log_dist] [Rank 0] step=20, skipped=0, lr=[5.125000000000001e-06], mom=[0.0] +steps: 20 loss: 0.1632 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:14:59,014] [INFO] [logging.py:107:log_dist] [Rank 0] step=21, skipped=0, lr=[5.368750000000001e-06], mom=[0.0] +steps: 21 loss: 0.0434 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-05 19:15:09,680] [INFO] [logging.py:107:log_dist] [Rank 0] step=22, skipped=0, lr=[5.612500000000001e-06], mom=[0.0] +steps: 22 loss: 0.0695 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:15:20,351] [INFO] [logging.py:107:log_dist] [Rank 0] step=23, skipped=0, lr=[5.85625e-06], mom=[0.0] +steps: 23 loss: 0.0631 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:15:31,013] [INFO] [logging.py:107:log_dist] [Rank 0] step=24, skipped=0, lr=[6.1e-06], mom=[0.0] +steps: 24 loss: 0.0587 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-05 19:15:41,681] [INFO] [logging.py:107:log_dist] [Rank 0] step=25, skipped=0, lr=[6.34375e-06], mom=[0.0] +steps: 25 loss: 0.0747 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:15:52,511] [INFO] [logging.py:107:log_dist] [Rank 0] step=26, skipped=0, lr=[6.587500000000001e-06], mom=[0.0] +steps: 26 loss: 0.0569 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-05 19:16:03,174] [INFO] [logging.py:107:log_dist] [Rank 0] step=27, skipped=0, lr=[6.831250000000002e-06], mom=[0.0] +steps: 27 loss: 0.0618 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-05 19:16:13,838] [INFO] [logging.py:107:log_dist] [Rank 0] step=28, skipped=0, lr=[7.075000000000002e-06], mom=[0.0] +steps: 28 loss: 0.1613 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-05 19:16:24,505] [INFO] [logging.py:107:log_dist] [Rank 0] step=29, skipped=0, lr=[7.318750000000002e-06], mom=[0.0] +steps: 29 loss: 0.0631 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:16:35,168] [INFO] [logging.py:107:log_dist] [Rank 0] step=30, skipped=0, lr=[7.562500000000002e-06], mom=[0.0] +steps: 30 loss: 0.1002 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-05 19:16:45,842] [INFO] [logging.py:107:log_dist] [Rank 0] step=31, skipped=0, lr=[7.806250000000002e-06], mom=[0.0] +steps: 31 loss: 0.0476 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:16:56,510] [INFO] [logging.py:107:log_dist] [Rank 0] step=32, skipped=0, lr=[8.050000000000003e-06], mom=[0.0] +steps: 32 loss: 0.2248 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:17:07,175] [INFO] [logging.py:107:log_dist] [Rank 0] step=33, skipped=0, lr=[8.293750000000002e-06], mom=[0.0] +steps: 33 loss: 0.0427 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 19:17:18,008] [INFO] [logging.py:107:log_dist] [Rank 0] step=34, skipped=0, lr=[8.5375e-06], mom=[0.0] +steps: 34 loss: 0.2641 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 19:17:28,682] [INFO] [logging.py:107:log_dist] [Rank 0] step=35, skipped=0, lr=[8.78125e-06], mom=[0.0] +steps: 35 loss: 0.0868 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:17:39,348] [INFO] [logging.py:107:log_dist] [Rank 0] step=36, skipped=0, lr=[9.024999999999999e-06], mom=[0.0] +steps: 36 loss: 0.0658 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:17:50,015] [INFO] [logging.py:107:log_dist] [Rank 0] step=37, skipped=0, lr=[9.26875e-06], mom=[0.0] +steps: 37 loss: 0.2642 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:18:00,685] [INFO] [logging.py:107:log_dist] [Rank 0] step=38, skipped=0, lr=[9.512499999999999e-06], mom=[0.0] +steps: 38 loss: 0.1312 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:18:11,358] [INFO] [logging.py:107:log_dist] [Rank 0] step=39, skipped=0, lr=[9.756249999999998e-06], mom=[0.0] +steps: 39 loss: 0.0507 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:18:22,024] [INFO] [logging.py:107:log_dist] [Rank 0] step=40, skipped=0, lr=[1e-05], mom=[0.0] +steps: 40 loss: 0.2091 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:18:32,685] [INFO] [logging.py:107:log_dist] [Rank 0] step=41, skipped=0, lr=[1e-05], mom=[0.0] +steps: 41 loss: 0.0715 iter time (s): 10.633 samples/sec: 0.094 +Started new epoch: 2 +[2025-05-05 19:18:43,857] [INFO] [logging.py:107:log_dist] [Rank 0] step=42, skipped=0, lr=[1e-05], mom=[0.0] +steps: 42 loss: 0.0869 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 19:18:54,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=43, skipped=0, lr=[1e-05], mom=[0.0] +steps: 43 loss: 0.1037 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:19:05,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=44, skipped=0, lr=[1e-05], mom=[0.0] +steps: 44 loss: 0.0810 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:19:15,867] [INFO] [logging.py:107:log_dist] [Rank 0] step=45, skipped=0, lr=[1e-05], mom=[0.0] +steps: 45 loss: 0.0383 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:19:26,538] [INFO] [logging.py:107:log_dist] [Rank 0] step=46, skipped=0, lr=[1e-05], mom=[0.0] +steps: 46 loss: 0.1184 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:19:37,203] [INFO] [logging.py:107:log_dist] [Rank 0] step=47, skipped=0, lr=[1e-05], mom=[0.0] +steps: 47 loss: 0.0590 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 19:19:47,869] [INFO] [logging.py:107:log_dist] [Rank 0] step=48, skipped=0, lr=[1e-05], mom=[0.0] +steps: 48 loss: 0.0831 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:19:58,538] [INFO] [logging.py:107:log_dist] [Rank 0] step=49, skipped=0, lr=[1e-05], mom=[0.0] +steps: 49 loss: 0.0923 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:20:09,207] [INFO] [logging.py:107:log_dist] [Rank 0] step=50, skipped=0, lr=[1e-05], mom=[0.0] +steps: 50 loss: 0.2141 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:20:20,061] [INFO] [logging.py:107:log_dist] [Rank 0] step=51, skipped=0, lr=[1e-05], mom=[0.0] +steps: 51 loss: 0.0691 iter time (s): 10.824 samples/sec: 0.092 +[2025-05-05 19:20:30,731] [INFO] [logging.py:107:log_dist] [Rank 0] step=52, skipped=0, lr=[1e-05], mom=[0.0] +steps: 52 loss: 0.1507 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:20:41,400] [INFO] [logging.py:107:log_dist] [Rank 0] step=53, skipped=0, lr=[1e-05], mom=[0.0] +steps: 53 loss: 0.0389 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:20:52,068] [INFO] [logging.py:107:log_dist] [Rank 0] step=54, skipped=0, lr=[1e-05], mom=[0.0] +steps: 54 loss: 0.0560 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:21:02,736] [INFO] [logging.py:107:log_dist] [Rank 0] step=55, skipped=0, lr=[1e-05], mom=[0.0] +steps: 55 loss: 0.0683 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:21:13,411] [INFO] [logging.py:107:log_dist] [Rank 0] step=56, skipped=0, lr=[1e-05], mom=[0.0] +steps: 56 loss: 0.0316 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:21:24,080] [INFO] [logging.py:107:log_dist] [Rank 0] step=57, skipped=0, lr=[1e-05], mom=[0.0] +steps: 57 loss: 0.0755 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:21:34,754] [INFO] [logging.py:107:log_dist] [Rank 0] step=58, skipped=0, lr=[1e-05], mom=[0.0] +steps: 58 loss: 0.0405 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:21:45,591] [INFO] [logging.py:107:log_dist] [Rank 0] step=59, skipped=0, lr=[1e-05], mom=[0.0] +steps: 59 loss: 0.1052 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 19:21:56,259] [INFO] [logging.py:107:log_dist] [Rank 0] step=60, skipped=0, lr=[1e-05], mom=[0.0] +steps: 60 loss: 0.0963 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:22:06,929] [INFO] [logging.py:107:log_dist] [Rank 0] step=61, skipped=0, lr=[1e-05], mom=[0.0] +steps: 61 loss: 0.0969 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:22:17,598] [INFO] [logging.py:107:log_dist] [Rank 0] step=62, skipped=0, lr=[1e-05], mom=[0.0] +steps: 62 loss: 0.0554 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:22:28,269] [INFO] [logging.py:107:log_dist] [Rank 0] step=63, skipped=0, lr=[1e-05], mom=[0.0] +steps: 63 loss: 0.0707 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:22:38,943] [INFO] [logging.py:107:log_dist] [Rank 0] step=64, skipped=0, lr=[1e-05], mom=[0.0] +steps: 64 loss: 0.0383 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:22:49,611] [INFO] [logging.py:107:log_dist] [Rank 0] step=65, skipped=0, lr=[1e-05], mom=[0.0] +steps: 65 loss: 0.0496 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:23:00,277] [INFO] [logging.py:107:log_dist] [Rank 0] step=66, skipped=0, lr=[1e-05], mom=[0.0] +steps: 66 loss: 0.2025 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:23:10,949] [INFO] [logging.py:107:log_dist] [Rank 0] step=67, skipped=0, lr=[1e-05], mom=[0.0] +steps: 67 loss: 0.0449 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:23:21,780] [INFO] [logging.py:107:log_dist] [Rank 0] step=68, skipped=0, lr=[1e-05], mom=[0.0] +steps: 68 loss: 0.0454 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 19:23:32,448] [INFO] [logging.py:107:log_dist] [Rank 0] step=69, skipped=0, lr=[1e-05], mom=[0.0] +steps: 69 loss: 0.0563 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:23:43,119] [INFO] [logging.py:107:log_dist] [Rank 0] step=70, skipped=0, lr=[1e-05], mom=[0.0] +steps: 70 loss: 0.1996 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:23:53,785] [INFO] [logging.py:107:log_dist] [Rank 0] step=71, skipped=0, lr=[1e-05], mom=[0.0] +steps: 71 loss: 0.0539 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:24:04,452] [INFO] [logging.py:107:log_dist] [Rank 0] step=72, skipped=0, lr=[1e-05], mom=[0.0] +steps: 72 loss: 0.2633 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:24:15,122] [INFO] [logging.py:107:log_dist] [Rank 0] step=73, skipped=0, lr=[1e-05], mom=[0.0] +steps: 73 loss: 0.4052 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:24:25,793] [INFO] [logging.py:107:log_dist] [Rank 0] step=74, skipped=0, lr=[1e-05], mom=[0.0] +steps: 74 loss: 0.0885 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:24:36,464] [INFO] [logging.py:107:log_dist] [Rank 0] step=75, skipped=0, lr=[1e-05], mom=[0.0] +steps: 75 loss: 0.3034 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:24:47,297] [INFO] [logging.py:107:log_dist] [Rank 0] step=76, skipped=0, lr=[1e-05], mom=[0.0] +steps: 76 loss: 0.1539 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 19:24:57,963] [INFO] [logging.py:107:log_dist] [Rank 0] step=77, skipped=0, lr=[1e-05], mom=[0.0] +steps: 77 loss: 0.0672 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:25:08,638] [INFO] [logging.py:107:log_dist] [Rank 0] step=78, skipped=0, lr=[1e-05], mom=[0.0] +steps: 78 loss: 0.0360 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:25:19,306] [INFO] [logging.py:107:log_dist] [Rank 0] step=79, skipped=0, lr=[1e-05], mom=[0.0] +steps: 79 loss: 0.0868 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:25:29,972] [INFO] [logging.py:107:log_dist] [Rank 0] step=80, skipped=0, lr=[1e-05], mom=[0.0] +steps: 80 loss: 0.0556 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:25:40,643] [INFO] [logging.py:107:log_dist] [Rank 0] step=81, skipped=0, lr=[1e-05], mom=[0.0] +steps: 81 loss: 0.0383 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:25:51,308] [INFO] [logging.py:107:log_dist] [Rank 0] step=82, skipped=0, lr=[1e-05], mom=[0.0] +steps: 82 loss: 0.1174 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 3 +[2025-05-05 19:26:02,325] [INFO] [logging.py:107:log_dist] [Rank 0] step=83, skipped=0, lr=[1e-05], mom=[0.0] +steps: 83 loss: 0.0486 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:26:12,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=84, skipped=0, lr=[1e-05], mom=[0.0] +steps: 84 loss: 0.0491 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:26:23,854] [INFO] [logging.py:107:log_dist] [Rank 0] step=85, skipped=0, lr=[1e-05], mom=[0.0] +steps: 85 loss: 0.1208 iter time (s): 10.824 samples/sec: 0.092 +[2025-05-05 19:26:34,523] [INFO] [logging.py:107:log_dist] [Rank 0] step=86, skipped=0, lr=[1e-05], mom=[0.0] +steps: 86 loss: 0.0702 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:26:45,193] [INFO] [logging.py:107:log_dist] [Rank 0] step=87, skipped=0, lr=[1e-05], mom=[0.0] +steps: 87 loss: 0.0350 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:26:55,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=88, skipped=0, lr=[1e-05], mom=[0.0] +steps: 88 loss: 0.1058 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:27:06,529] [INFO] [logging.py:107:log_dist] [Rank 0] step=89, skipped=0, lr=[1e-05], mom=[0.0] +steps: 89 loss: 0.0749 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:27:17,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=90, skipped=0, lr=[1e-05], mom=[0.0] +steps: 90 loss: 0.0450 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:27:27,873] [INFO] [logging.py:107:log_dist] [Rank 0] step=91, skipped=0, lr=[1e-05], mom=[0.0] +steps: 91 loss: 0.1946 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:27:38,540] [INFO] [logging.py:107:log_dist] [Rank 0] step=92, skipped=0, lr=[1e-05], mom=[0.0] +steps: 92 loss: 0.0752 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:27:49,371] [INFO] [logging.py:107:log_dist] [Rank 0] step=93, skipped=0, lr=[1e-05], mom=[0.0] +steps: 93 loss: 0.0935 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 19:28:00,038] [INFO] [logging.py:107:log_dist] [Rank 0] step=94, skipped=0, lr=[1e-05], mom=[0.0] +steps: 94 loss: 0.0529 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:28:10,705] [INFO] [logging.py:107:log_dist] [Rank 0] step=95, skipped=0, lr=[1e-05], mom=[0.0] +steps: 95 loss: 0.0371 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:28:21,377] [INFO] [logging.py:107:log_dist] [Rank 0] step=96, skipped=0, lr=[1e-05], mom=[0.0] +steps: 96 loss: 0.0323 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:28:32,048] [INFO] [logging.py:107:log_dist] [Rank 0] step=97, skipped=0, lr=[1e-05], mom=[0.0] +steps: 97 loss: 0.0425 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:28:42,716] [INFO] [logging.py:107:log_dist] [Rank 0] step=98, skipped=0, lr=[1e-05], mom=[0.0] +steps: 98 loss: 0.0813 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:28:53,389] [INFO] [logging.py:107:log_dist] [Rank 0] step=99, skipped=0, lr=[1e-05], mom=[0.0] +steps: 99 loss: 0.0675 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:29:04,059] [INFO] [logging.py:107:log_dist] [Rank 0] step=100, skipped=0, lr=[1e-05], mom=[0.0] +steps: 100 loss: 0.0530 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:29:14,729] [INFO] [logging.py:107:log_dist] [Rank 0] step=101, skipped=0, lr=[1e-05], mom=[0.0] +steps: 101 loss: 0.1679 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:29:25,565] [INFO] [logging.py:107:log_dist] [Rank 0] step=102, skipped=0, lr=[1e-05], mom=[0.0] +steps: 102 loss: 0.0505 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-05 19:29:36,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=103, skipped=0, lr=[1e-05], mom=[0.0] +steps: 103 loss: 0.1135 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:29:46,904] [INFO] [logging.py:107:log_dist] [Rank 0] step=104, skipped=0, lr=[1e-05], mom=[0.0] +steps: 104 loss: 0.0495 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:29:57,576] [INFO] [logging.py:107:log_dist] [Rank 0] step=105, skipped=0, lr=[1e-05], mom=[0.0] +steps: 105 loss: 0.0689 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:30:08,263] [INFO] [logging.py:107:log_dist] [Rank 0] step=106, skipped=0, lr=[1e-05], mom=[0.0] +steps: 106 loss: 0.0709 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-05 19:30:18,932] [INFO] [logging.py:107:log_dist] [Rank 0] step=107, skipped=0, lr=[1e-05], mom=[0.0] +steps: 107 loss: 0.1550 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:30:29,606] [INFO] [logging.py:107:log_dist] [Rank 0] step=108, skipped=0, lr=[1e-05], mom=[0.0] +steps: 108 loss: 0.0455 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:30:40,277] [INFO] [logging.py:107:log_dist] [Rank 0] step=109, skipped=0, lr=[1e-05], mom=[0.0] +steps: 109 loss: 0.0447 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:30:51,110] [INFO] [logging.py:107:log_dist] [Rank 0] step=110, skipped=0, lr=[1e-05], mom=[0.0] +steps: 110 loss: 0.2602 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 19:31:01,784] [INFO] [logging.py:107:log_dist] [Rank 0] step=111, skipped=0, lr=[1e-05], mom=[0.0] +steps: 111 loss: 0.3300 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:31:12,455] [INFO] [logging.py:107:log_dist] [Rank 0] step=112, skipped=0, lr=[1e-05], mom=[0.0] +steps: 112 loss: 0.0437 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:31:23,124] [INFO] [logging.py:107:log_dist] [Rank 0] step=113, skipped=0, lr=[1e-05], mom=[0.0] +steps: 113 loss: 0.0691 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:31:33,797] [INFO] [logging.py:107:log_dist] [Rank 0] step=114, skipped=0, lr=[1e-05], mom=[0.0] +steps: 114 loss: 0.0723 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:31:44,464] [INFO] [logging.py:107:log_dist] [Rank 0] step=115, skipped=0, lr=[1e-05], mom=[0.0] +steps: 115 loss: 0.0351 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:31:55,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=116, skipped=0, lr=[1e-05], mom=[0.0] +steps: 116 loss: 0.1012 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:32:05,800] [INFO] [logging.py:107:log_dist] [Rank 0] step=117, skipped=0, lr=[1e-05], mom=[0.0] +steps: 117 loss: 0.0593 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:32:16,468] [INFO] [logging.py:107:log_dist] [Rank 0] step=118, skipped=0, lr=[1e-05], mom=[0.0] +steps: 118 loss: 0.0363 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:32:27,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=119, skipped=0, lr=[1e-05], mom=[0.0] +steps: 119 loss: 0.1050 iter time (s): 10.824 samples/sec: 0.092 +[2025-05-05 19:32:37,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=120, skipped=0, lr=[1e-05], mom=[0.0] +steps: 120 loss: 0.0640 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:32:48,660] [INFO] [logging.py:107:log_dist] [Rank 0] step=121, skipped=0, lr=[1e-05], mom=[0.0] +steps: 121 loss: 0.0528 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:32:59,326] [INFO] [logging.py:107:log_dist] [Rank 0] step=122, skipped=0, lr=[1e-05], mom=[0.0] +steps: 122 loss: 0.0474 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:33:09,991] [INFO] [logging.py:107:log_dist] [Rank 0] step=123, skipped=0, lr=[1e-05], mom=[0.0] +steps: 123 loss: 0.2022 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 4 +[2025-05-05 19:33:20,998] [INFO] [logging.py:107:log_dist] [Rank 0] step=124, skipped=0, lr=[1e-05], mom=[0.0] +steps: 124 loss: 0.0445 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:33:31,670] [INFO] [logging.py:107:log_dist] [Rank 0] step=125, skipped=0, lr=[1e-05], mom=[0.0] +steps: 125 loss: 0.1628 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:33:42,340] [INFO] [logging.py:107:log_dist] [Rank 0] step=126, skipped=0, lr=[1e-05], mom=[0.0] +steps: 126 loss: 0.0543 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:33:53,168] [INFO] [logging.py:107:log_dist] [Rank 0] step=127, skipped=0, lr=[1e-05], mom=[0.0] +steps: 127 loss: 0.2037 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-05 19:34:03,836] [INFO] [logging.py:107:log_dist] [Rank 0] step=128, skipped=0, lr=[1e-05], mom=[0.0] +steps: 128 loss: 0.0389 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:34:14,512] [INFO] [logging.py:107:log_dist] [Rank 0] step=129, skipped=0, lr=[1e-05], mom=[0.0] +steps: 129 loss: 0.2760 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 19:34:25,187] [INFO] [logging.py:107:log_dist] [Rank 0] step=130, skipped=0, lr=[1e-05], mom=[0.0] +steps: 130 loss: 0.2546 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:34:35,854] [INFO] [logging.py:107:log_dist] [Rank 0] step=131, skipped=0, lr=[1e-05], mom=[0.0] +steps: 131 loss: 0.0355 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:34:46,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=132, skipped=0, lr=[1e-05], mom=[0.0] +steps: 132 loss: 0.0389 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:34:57,191] [INFO] [logging.py:107:log_dist] [Rank 0] step=133, skipped=0, lr=[1e-05], mom=[0.0] +steps: 133 loss: 0.0879 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 19:35:07,863] [INFO] [logging.py:107:log_dist] [Rank 0] step=134, skipped=0, lr=[1e-05], mom=[0.0] +steps: 134 loss: 0.0825 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:35:18,535] [INFO] [logging.py:107:log_dist] [Rank 0] step=135, skipped=0, lr=[1e-05], mom=[0.0] +steps: 135 loss: 0.0535 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:35:29,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=136, skipped=0, lr=[1e-05], mom=[0.0] +steps: 136 loss: 0.0371 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 19:35:40,036] [INFO] [logging.py:107:log_dist] [Rank 0] step=137, skipped=0, lr=[1e-05], mom=[0.0] +steps: 137 loss: 0.0397 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:35:50,715] [INFO] [logging.py:107:log_dist] [Rank 0] step=138, skipped=0, lr=[1e-05], mom=[0.0] +steps: 138 loss: 0.0460 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:36:01,385] [INFO] [logging.py:107:log_dist] [Rank 0] step=139, skipped=0, lr=[1e-05], mom=[0.0] +steps: 139 loss: 0.0487 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:36:12,065] [INFO] [logging.py:107:log_dist] [Rank 0] step=140, skipped=0, lr=[1e-05], mom=[0.0] +steps: 140 loss: 0.0600 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 19:36:22,734] [INFO] [logging.py:107:log_dist] [Rank 0] step=141, skipped=0, lr=[1e-05], mom=[0.0] +steps: 141 loss: 0.0935 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:36:33,407] [INFO] [logging.py:107:log_dist] [Rank 0] step=142, skipped=0, lr=[1e-05], mom=[0.0] +steps: 142 loss: 0.1007 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:36:44,078] [INFO] [logging.py:107:log_dist] [Rank 0] step=143, skipped=0, lr=[1e-05], mom=[0.0] +steps: 143 loss: 0.0349 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:36:54,908] [INFO] [logging.py:107:log_dist] [Rank 0] step=144, skipped=0, lr=[1e-05], mom=[0.0] +steps: 144 loss: 0.0883 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-05 19:37:05,580] [INFO] [logging.py:107:log_dist] [Rank 0] step=145, skipped=0, lr=[1e-05], mom=[0.0] +steps: 145 loss: 0.0698 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:37:16,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=146, skipped=0, lr=[1e-05], mom=[0.0] +steps: 146 loss: 0.0402 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:37:26,919] [INFO] [logging.py:107:log_dist] [Rank 0] step=147, skipped=0, lr=[1e-05], mom=[0.0] +steps: 147 loss: 0.0592 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:37:37,588] [INFO] [logging.py:107:log_dist] [Rank 0] step=148, skipped=0, lr=[1e-05], mom=[0.0] +steps: 148 loss: 0.0364 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:37:48,258] [INFO] [logging.py:107:log_dist] [Rank 0] step=149, skipped=0, lr=[1e-05], mom=[0.0] +steps: 149 loss: 0.0350 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:37:58,925] [INFO] [logging.py:107:log_dist] [Rank 0] step=150, skipped=0, lr=[1e-05], mom=[0.0] +steps: 150 loss: 0.0382 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:38:09,592] [INFO] [logging.py:107:log_dist] [Rank 0] step=151, skipped=0, lr=[1e-05], mom=[0.0] +steps: 151 loss: 0.0776 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:38:20,260] [INFO] [logging.py:107:log_dist] [Rank 0] step=152, skipped=0, lr=[1e-05], mom=[0.0] +steps: 152 loss: 0.1232 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:38:31,118] [INFO] [logging.py:107:log_dist] [Rank 0] step=153, skipped=0, lr=[1e-05], mom=[0.0] +steps: 153 loss: 0.1334 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 19:38:41,786] [INFO] [logging.py:107:log_dist] [Rank 0] step=154, skipped=0, lr=[1e-05], mom=[0.0] +steps: 154 loss: 0.4678 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:38:52,463] [INFO] [logging.py:107:log_dist] [Rank 0] step=155, skipped=0, lr=[1e-05], mom=[0.0] +steps: 155 loss: 0.0427 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 19:39:03,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=156, skipped=0, lr=[1e-05], mom=[0.0] +steps: 156 loss: 0.0379 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:39:13,798] [INFO] [logging.py:107:log_dist] [Rank 0] step=157, skipped=0, lr=[1e-05], mom=[0.0] +steps: 157 loss: 0.1474 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:39:24,478] [INFO] [logging.py:107:log_dist] [Rank 0] step=158, skipped=0, lr=[1e-05], mom=[0.0] +steps: 158 loss: 0.0304 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 19:39:35,147] [INFO] [logging.py:107:log_dist] [Rank 0] step=159, skipped=0, lr=[1e-05], mom=[0.0] +steps: 159 loss: 0.0467 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:39:45,809] [INFO] [logging.py:107:log_dist] [Rank 0] step=160, skipped=0, lr=[1e-05], mom=[0.0] +steps: 160 loss: 0.0711 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-05 19:39:56,639] [INFO] [logging.py:107:log_dist] [Rank 0] step=161, skipped=0, lr=[1e-05], mom=[0.0] +steps: 161 loss: 0.0395 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 19:40:07,306] [INFO] [logging.py:107:log_dist] [Rank 0] step=162, skipped=0, lr=[1e-05], mom=[0.0] +steps: 162 loss: 0.0746 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:40:17,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=163, skipped=0, lr=[1e-05], mom=[0.0] +steps: 163 loss: 0.1182 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:40:28,637] [INFO] [logging.py:107:log_dist] [Rank 0] step=164, skipped=0, lr=[1e-05], mom=[0.0] +steps: 164 loss: 0.0731 iter time (s): 10.640 samples/sec: 0.094 +Started new epoch: 5 +[2025-05-05 19:40:39,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=165, skipped=0, lr=[1e-05], mom=[0.0] +steps: 165 loss: 0.2328 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:40:50,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=166, skipped=0, lr=[1e-05], mom=[0.0] +steps: 166 loss: 0.0519 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:41:00,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=167, skipped=0, lr=[1e-05], mom=[0.0] +steps: 167 loss: 0.2065 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:41:11,661] [INFO] [logging.py:107:log_dist] [Rank 0] step=168, skipped=0, lr=[1e-05], mom=[0.0] +steps: 168 loss: 0.0324 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:41:22,327] [INFO] [logging.py:107:log_dist] [Rank 0] step=169, skipped=0, lr=[1e-05], mom=[0.0] +steps: 169 loss: 0.1184 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:41:33,164] [INFO] [logging.py:107:log_dist] [Rank 0] step=170, skipped=0, lr=[1e-05], mom=[0.0] +steps: 170 loss: 0.0386 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 19:41:43,841] [INFO] [logging.py:107:log_dist] [Rank 0] step=171, skipped=0, lr=[1e-05], mom=[0.0] +steps: 171 loss: 0.0799 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:41:54,509] [INFO] [logging.py:107:log_dist] [Rank 0] step=172, skipped=0, lr=[1e-05], mom=[0.0] +steps: 172 loss: 0.0577 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:42:05,181] [INFO] [logging.py:107:log_dist] [Rank 0] step=173, skipped=0, lr=[1e-05], mom=[0.0] +steps: 173 loss: 0.1030 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:42:15,849] [INFO] [logging.py:107:log_dist] [Rank 0] step=174, skipped=0, lr=[1e-05], mom=[0.0] +steps: 174 loss: 0.0536 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:42:26,517] [INFO] [logging.py:107:log_dist] [Rank 0] step=175, skipped=0, lr=[1e-05], mom=[0.0] +steps: 175 loss: 0.0475 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:42:37,189] [INFO] [logging.py:107:log_dist] [Rank 0] step=176, skipped=0, lr=[1e-05], mom=[0.0] +steps: 176 loss: 0.0767 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:42:47,856] [INFO] [logging.py:107:log_dist] [Rank 0] step=177, skipped=0, lr=[1e-05], mom=[0.0] +steps: 177 loss: 0.0563 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:42:58,692] [INFO] [logging.py:107:log_dist] [Rank 0] step=178, skipped=0, lr=[1e-05], mom=[0.0] +steps: 178 loss: 0.1059 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-05 19:43:09,362] [INFO] [logging.py:107:log_dist] [Rank 0] step=179, skipped=0, lr=[1e-05], mom=[0.0] +steps: 179 loss: 0.0614 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:43:20,031] [INFO] [logging.py:107:log_dist] [Rank 0] step=180, skipped=0, lr=[1e-05], mom=[0.0] +steps: 180 loss: 0.0372 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:43:30,698] [INFO] [logging.py:107:log_dist] [Rank 0] step=181, skipped=0, lr=[1e-05], mom=[0.0] +steps: 181 loss: 0.0513 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:43:41,370] [INFO] [logging.py:107:log_dist] [Rank 0] step=182, skipped=0, lr=[1e-05], mom=[0.0] +steps: 182 loss: 0.0427 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:43:52,036] [INFO] [logging.py:107:log_dist] [Rank 0] step=183, skipped=0, lr=[1e-05], mom=[0.0] +steps: 183 loss: 0.0445 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:44:02,704] [INFO] [logging.py:107:log_dist] [Rank 0] step=184, skipped=0, lr=[1e-05], mom=[0.0] +steps: 184 loss: 0.1420 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:44:13,374] [INFO] [logging.py:107:log_dist] [Rank 0] step=185, skipped=0, lr=[1e-05], mom=[0.0] +steps: 185 loss: 0.0898 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:44:24,052] [INFO] [logging.py:107:log_dist] [Rank 0] step=186, skipped=0, lr=[1e-05], mom=[0.0] +steps: 186 loss: 0.0561 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 19:44:34,908] [INFO] [logging.py:107:log_dist] [Rank 0] step=187, skipped=0, lr=[1e-05], mom=[0.0] +steps: 187 loss: 0.0540 iter time (s): 10.824 samples/sec: 0.092 +[2025-05-05 19:44:45,578] [INFO] [logging.py:107:log_dist] [Rank 0] step=188, skipped=0, lr=[1e-05], mom=[0.0] +steps: 188 loss: 0.0681 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:44:56,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=189, skipped=0, lr=[1e-05], mom=[0.0] +steps: 189 loss: 0.0785 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:45:06,919] [INFO] [logging.py:107:log_dist] [Rank 0] step=190, skipped=0, lr=[1e-05], mom=[0.0] +steps: 190 loss: 0.0522 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:45:17,593] [INFO] [logging.py:107:log_dist] [Rank 0] step=191, skipped=0, lr=[1e-05], mom=[0.0] +steps: 191 loss: 0.0369 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:45:28,265] [INFO] [logging.py:107:log_dist] [Rank 0] step=192, skipped=0, lr=[1e-05], mom=[0.0] +steps: 192 loss: 0.0678 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:45:38,936] [INFO] [logging.py:107:log_dist] [Rank 0] step=193, skipped=0, lr=[1e-05], mom=[0.0] +steps: 193 loss: 0.1732 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:45:49,607] [INFO] [logging.py:107:log_dist] [Rank 0] step=194, skipped=0, lr=[1e-05], mom=[0.0] +steps: 194 loss: 0.0484 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:46:00,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=195, skipped=0, lr=[1e-05], mom=[0.0] +steps: 195 loss: 0.0376 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-05 19:46:11,103] [INFO] [logging.py:107:log_dist] [Rank 0] step=196, skipped=0, lr=[1e-05], mom=[0.0] +steps: 196 loss: 0.0662 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:46:21,774] [INFO] [logging.py:107:log_dist] [Rank 0] step=197, skipped=0, lr=[1e-05], mom=[0.0] +steps: 197 loss: 0.0720 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:46:32,444] [INFO] [logging.py:107:log_dist] [Rank 0] step=198, skipped=0, lr=[1e-05], mom=[0.0] +steps: 198 loss: 0.0544 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:46:43,113] [INFO] [logging.py:107:log_dist] [Rank 0] step=199, skipped=0, lr=[1e-05], mom=[0.0] +steps: 199 loss: 0.0907 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:46:53,790] [INFO] [logging.py:107:log_dist] [Rank 0] step=200, skipped=0, lr=[1e-05], mom=[0.0] +steps: 200 loss: 0.1065 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 19:47:04,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=201, skipped=0, lr=[1e-05], mom=[0.0] +steps: 201 loss: 0.0397 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:47:15,127] [INFO] [logging.py:107:log_dist] [Rank 0] step=202, skipped=0, lr=[1e-05], mom=[0.0] +steps: 202 loss: 0.1689 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:47:25,801] [INFO] [logging.py:107:log_dist] [Rank 0] step=203, skipped=0, lr=[1e-05], mom=[0.0] +steps: 203 loss: 0.0505 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 19:47:36,641] [INFO] [logging.py:107:log_dist] [Rank 0] step=204, skipped=0, lr=[1e-05], mom=[0.0] +steps: 204 loss: 0.0870 iter time (s): 10.809 samples/sec: 0.093 +[2025-05-05 19:47:47,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=205, skipped=0, lr=[1e-05], mom=[0.0] +steps: 205 loss: 0.0498 iter time (s): 10.633 samples/sec: 0.094 +Started new epoch: 6 +[2025-05-05 19:47:58,310] [INFO] [logging.py:107:log_dist] [Rank 0] step=206, skipped=0, lr=[1e-05], mom=[0.0] +steps: 206 loss: 0.0401 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:48:08,982] [INFO] [logging.py:107:log_dist] [Rank 0] step=207, skipped=0, lr=[1e-05], mom=[0.0] +steps: 207 loss: 0.0978 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:48:19,659] [INFO] [logging.py:107:log_dist] [Rank 0] step=208, skipped=0, lr=[1e-05], mom=[0.0] +steps: 208 loss: 0.0489 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 19:48:30,332] [INFO] [logging.py:107:log_dist] [Rank 0] step=209, skipped=0, lr=[1e-05], mom=[0.0] +steps: 209 loss: 0.0384 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:48:40,997] [INFO] [logging.py:107:log_dist] [Rank 0] step=210, skipped=0, lr=[1e-05], mom=[0.0] +steps: 210 loss: 0.0395 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:48:51,668] [INFO] [logging.py:107:log_dist] [Rank 0] step=211, skipped=0, lr=[1e-05], mom=[0.0] +steps: 211 loss: 0.1135 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:49:02,504] [INFO] [logging.py:107:log_dist] [Rank 0] step=212, skipped=0, lr=[1e-05], mom=[0.0] +steps: 212 loss: 0.0834 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-05 19:49:13,171] [INFO] [logging.py:107:log_dist] [Rank 0] step=213, skipped=0, lr=[1e-05], mom=[0.0] +steps: 213 loss: 0.0817 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:49:23,840] [INFO] [logging.py:107:log_dist] [Rank 0] step=214, skipped=0, lr=[1e-05], mom=[0.0] +steps: 214 loss: 0.0503 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:49:34,512] [INFO] [logging.py:107:log_dist] [Rank 0] step=215, skipped=0, lr=[1e-05], mom=[0.0] +steps: 215 loss: 0.2206 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:49:45,181] [INFO] [logging.py:107:log_dist] [Rank 0] step=216, skipped=0, lr=[1e-05], mom=[0.0] +steps: 216 loss: 0.0830 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:49:55,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=217, skipped=0, lr=[1e-05], mom=[0.0] +steps: 217 loss: 0.0374 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:50:06,522] [INFO] [logging.py:107:log_dist] [Rank 0] step=218, skipped=0, lr=[1e-05], mom=[0.0] +steps: 218 loss: 0.0513 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:50:17,189] [INFO] [logging.py:107:log_dist] [Rank 0] step=219, skipped=0, lr=[1e-05], mom=[0.0] +steps: 219 loss: 0.3386 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:50:27,861] [INFO] [logging.py:107:log_dist] [Rank 0] step=220, skipped=0, lr=[1e-05], mom=[0.0] +steps: 220 loss: 0.0364 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:50:38,713] [INFO] [logging.py:107:log_dist] [Rank 0] step=221, skipped=0, lr=[1e-05], mom=[0.0] +steps: 221 loss: 0.0604 iter time (s): 10.821 samples/sec: 0.092 +[2025-05-05 19:50:49,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=222, skipped=0, lr=[1e-05], mom=[0.0] +steps: 222 loss: 0.0944 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:51:00,052] [INFO] [logging.py:107:log_dist] [Rank 0] step=223, skipped=0, lr=[1e-05], mom=[0.0] +steps: 223 loss: 0.0361 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:51:10,719] [INFO] [logging.py:107:log_dist] [Rank 0] step=224, skipped=0, lr=[1e-05], mom=[0.0] +steps: 224 loss: 0.0442 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:51:21,387] [INFO] [logging.py:107:log_dist] [Rank 0] step=225, skipped=0, lr=[1e-05], mom=[0.0] +steps: 225 loss: 0.1011 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:51:32,058] [INFO] [logging.py:107:log_dist] [Rank 0] step=226, skipped=0, lr=[1e-05], mom=[0.0] +steps: 226 loss: 0.3403 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:51:42,727] [INFO] [logging.py:107:log_dist] [Rank 0] step=227, skipped=0, lr=[1e-05], mom=[0.0] +steps: 227 loss: 0.0699 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:51:53,395] [INFO] [logging.py:107:log_dist] [Rank 0] step=228, skipped=0, lr=[1e-05], mom=[0.0] +steps: 228 loss: 0.0795 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:52:04,232] [INFO] [logging.py:107:log_dist] [Rank 0] step=229, skipped=0, lr=[1e-05], mom=[0.0] +steps: 229 loss: 0.0413 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-05 19:52:14,900] [INFO] [logging.py:107:log_dist] [Rank 0] step=230, skipped=0, lr=[1e-05], mom=[0.0] +steps: 230 loss: 0.0845 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:52:25,571] [INFO] [logging.py:107:log_dist] [Rank 0] step=231, skipped=0, lr=[1e-05], mom=[0.0] +steps: 231 loss: 0.1472 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:52:36,241] [INFO] [logging.py:107:log_dist] [Rank 0] step=232, skipped=0, lr=[1e-05], mom=[0.0] +steps: 232 loss: 0.0576 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:52:46,915] [INFO] [logging.py:107:log_dist] [Rank 0] step=233, skipped=0, lr=[1e-05], mom=[0.0] +steps: 233 loss: 0.0676 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 19:52:57,581] [INFO] [logging.py:107:log_dist] [Rank 0] step=234, skipped=0, lr=[1e-05], mom=[0.0] +steps: 234 loss: 0.0666 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:53:08,248] [INFO] [logging.py:107:log_dist] [Rank 0] step=235, skipped=0, lr=[1e-05], mom=[0.0] +steps: 235 loss: 0.0779 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:53:18,919] [INFO] [logging.py:107:log_dist] [Rank 0] step=236, skipped=0, lr=[1e-05], mom=[0.0] +steps: 236 loss: 0.1183 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:53:29,587] [INFO] [logging.py:107:log_dist] [Rank 0] step=237, skipped=0, lr=[1e-05], mom=[0.0] +steps: 237 loss: 0.1838 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:53:40,432] [INFO] [logging.py:107:log_dist] [Rank 0] step=238, skipped=0, lr=[1e-05], mom=[0.0] +steps: 238 loss: 0.0405 iter time (s): 10.814 samples/sec: 0.092 +[2025-05-05 19:53:51,101] [INFO] [logging.py:107:log_dist] [Rank 0] step=239, skipped=0, lr=[1e-05], mom=[0.0] +steps: 239 loss: 0.0509 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:54:01,769] [INFO] [logging.py:107:log_dist] [Rank 0] step=240, skipped=0, lr=[1e-05], mom=[0.0] +steps: 240 loss: 0.0647 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:54:12,442] [INFO] [logging.py:107:log_dist] [Rank 0] step=241, skipped=0, lr=[1e-05], mom=[0.0] +steps: 241 loss: 0.1580 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:54:23,109] [INFO] [logging.py:107:log_dist] [Rank 0] step=242, skipped=0, lr=[1e-05], mom=[0.0] +steps: 242 loss: 0.0825 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:54:33,778] [INFO] [logging.py:107:log_dist] [Rank 0] step=243, skipped=0, lr=[1e-05], mom=[0.0] +steps: 243 loss: 0.0562 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:54:44,448] [INFO] [logging.py:107:log_dist] [Rank 0] step=244, skipped=0, lr=[1e-05], mom=[0.0] +steps: 244 loss: 0.0704 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:54:55,122] [INFO] [logging.py:107:log_dist] [Rank 0] step=245, skipped=0, lr=[1e-05], mom=[0.0] +steps: 245 loss: 0.1041 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:55:05,946] [INFO] [logging.py:107:log_dist] [Rank 0] step=246, skipped=0, lr=[1e-05], mom=[0.0] +steps: 246 loss: 0.0827 iter time (s): 10.797 samples/sec: 0.093 +Started new epoch: 7 +[2025-05-05 19:55:16,981] [INFO] [logging.py:107:log_dist] [Rank 0] step=247, skipped=0, lr=[1e-05], mom=[0.0] +steps: 247 loss: 0.1176 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 19:55:27,648] [INFO] [logging.py:107:log_dist] [Rank 0] step=248, skipped=0, lr=[1e-05], mom=[0.0] +steps: 248 loss: 0.2182 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:55:38,322] [INFO] [logging.py:107:log_dist] [Rank 0] step=249, skipped=0, lr=[1e-05], mom=[0.0] +steps: 249 loss: 0.0959 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 19:55:48,995] [INFO] [logging.py:107:log_dist] [Rank 0] step=250, skipped=0, lr=[1e-05], mom=[0.0] +steps: 250 loss: 0.0474 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:55:59,660] [INFO] [logging.py:107:log_dist] [Rank 0] step=251, skipped=0, lr=[1e-05], mom=[0.0] +steps: 251 loss: 0.0355 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-05 19:56:10,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=252, skipped=0, lr=[1e-05], mom=[0.0] +steps: 252 loss: 0.2212 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 19:56:21,006] [INFO] [logging.py:107:log_dist] [Rank 0] step=253, skipped=0, lr=[1e-05], mom=[0.0] +steps: 253 loss: 0.0405 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:56:31,673] [INFO] [logging.py:107:log_dist] [Rank 0] step=254, skipped=0, lr=[1e-05], mom=[0.0] +steps: 254 loss: 0.0340 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 19:56:42,530] [INFO] [logging.py:107:log_dist] [Rank 0] step=255, skipped=0, lr=[1e-05], mom=[0.0] +steps: 255 loss: 0.0611 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-05 19:56:53,197] [INFO] [logging.py:107:log_dist] [Rank 0] step=256, skipped=0, lr=[1e-05], mom=[0.0] +steps: 256 loss: 0.1018 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 19:57:03,864] [INFO] [logging.py:107:log_dist] [Rank 0] step=257, skipped=0, lr=[1e-05], mom=[0.0] +steps: 257 loss: 0.1831 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 19:57:14,533] [INFO] [logging.py:107:log_dist] [Rank 0] step=258, skipped=0, lr=[1e-05], mom=[0.0] +steps: 258 loss: 0.2341 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:57:25,205] [INFO] [logging.py:107:log_dist] [Rank 0] step=259, skipped=0, lr=[1e-05], mom=[0.0] +steps: 259 loss: 0.0380 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:57:35,877] [INFO] [logging.py:107:log_dist] [Rank 0] step=260, skipped=0, lr=[1e-05], mom=[0.0] +steps: 260 loss: 0.0500 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:57:46,547] [INFO] [logging.py:107:log_dist] [Rank 0] step=261, skipped=0, lr=[1e-05], mom=[0.0] +steps: 261 loss: 0.1631 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:57:57,217] [INFO] [logging.py:107:log_dist] [Rank 0] step=262, skipped=0, lr=[1e-05], mom=[0.0] +steps: 262 loss: 0.0630 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:58:08,044] [INFO] [logging.py:107:log_dist] [Rank 0] step=263, skipped=0, lr=[1e-05], mom=[0.0] +steps: 263 loss: 0.0545 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-05 19:58:18,715] [INFO] [logging.py:107:log_dist] [Rank 0] step=264, skipped=0, lr=[1e-05], mom=[0.0] +steps: 264 loss: 0.0566 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:58:29,384] [INFO] [logging.py:107:log_dist] [Rank 0] step=265, skipped=0, lr=[1e-05], mom=[0.0] +steps: 265 loss: 0.0444 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 19:58:40,053] [INFO] [logging.py:107:log_dist] [Rank 0] step=266, skipped=0, lr=[1e-05], mom=[0.0] +steps: 266 loss: 0.1065 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 19:58:50,731] [INFO] [logging.py:107:log_dist] [Rank 0] step=267, skipped=0, lr=[1e-05], mom=[0.0] +steps: 267 loss: 0.0464 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 19:59:01,414] [INFO] [logging.py:107:log_dist] [Rank 0] step=268, skipped=0, lr=[1e-05], mom=[0.0] +steps: 268 loss: 0.0490 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 19:59:12,088] [INFO] [logging.py:107:log_dist] [Rank 0] step=269, skipped=0, lr=[1e-05], mom=[0.0] +steps: 269 loss: 0.0557 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 19:59:22,761] [INFO] [logging.py:107:log_dist] [Rank 0] step=270, skipped=0, lr=[1e-05], mom=[0.0] +steps: 270 loss: 0.0555 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 19:59:33,433] [INFO] [logging.py:107:log_dist] [Rank 0] step=271, skipped=0, lr=[1e-05], mom=[0.0] +steps: 271 loss: 0.0362 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 19:59:44,261] [INFO] [logging.py:107:log_dist] [Rank 0] step=272, skipped=0, lr=[1e-05], mom=[0.0] +steps: 272 loss: 0.2686 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-05 19:59:54,931] [INFO] [logging.py:107:log_dist] [Rank 0] step=273, skipped=0, lr=[1e-05], mom=[0.0] +steps: 273 loss: 0.1079 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:00:05,602] [INFO] [logging.py:107:log_dist] [Rank 0] step=274, skipped=0, lr=[1e-05], mom=[0.0] +steps: 274 loss: 0.0322 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:00:16,268] [INFO] [logging.py:107:log_dist] [Rank 0] step=275, skipped=0, lr=[1e-05], mom=[0.0] +steps: 275 loss: 0.0645 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:00:26,941] [INFO] [logging.py:107:log_dist] [Rank 0] step=276, skipped=0, lr=[1e-05], mom=[0.0] +steps: 276 loss: 0.0995 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:00:37,606] [INFO] [logging.py:107:log_dist] [Rank 0] step=277, skipped=0, lr=[1e-05], mom=[0.0] +steps: 277 loss: 0.4038 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:00:48,273] [INFO] [logging.py:107:log_dist] [Rank 0] step=278, skipped=0, lr=[1e-05], mom=[0.0] +steps: 278 loss: 0.0391 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:00:58,945] [INFO] [logging.py:107:log_dist] [Rank 0] step=279, skipped=0, lr=[1e-05], mom=[0.0] +steps: 279 loss: 0.2418 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:01:09,612] [INFO] [logging.py:107:log_dist] [Rank 0] step=280, skipped=0, lr=[1e-05], mom=[0.0] +steps: 280 loss: 0.0366 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:01:20,468] [INFO] [logging.py:107:log_dist] [Rank 0] step=281, skipped=0, lr=[1e-05], mom=[0.0] +steps: 281 loss: 0.0656 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-05 20:01:31,143] [INFO] [logging.py:107:log_dist] [Rank 0] step=282, skipped=0, lr=[1e-05], mom=[0.0] +steps: 282 loss: 0.1250 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:01:41,808] [INFO] [logging.py:107:log_dist] [Rank 0] step=283, skipped=0, lr=[1e-05], mom=[0.0] +steps: 283 loss: 0.0740 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 20:01:52,475] [INFO] [logging.py:107:log_dist] [Rank 0] step=284, skipped=0, lr=[1e-05], mom=[0.0] +steps: 284 loss: 0.0601 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:02:03,147] [INFO] [logging.py:107:log_dist] [Rank 0] step=285, skipped=0, lr=[1e-05], mom=[0.0] +steps: 285 loss: 0.2858 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:02:13,815] [INFO] [logging.py:107:log_dist] [Rank 0] step=286, skipped=0, lr=[1e-05], mom=[0.0] +steps: 286 loss: 0.2112 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:02:24,489] [INFO] [logging.py:107:log_dist] [Rank 0] step=287, skipped=0, lr=[1e-05], mom=[0.0] +steps: 287 loss: 0.0977 iter time (s): 10.646 samples/sec: 0.094 +Started new epoch: 8 +[2025-05-05 20:02:35,508] [INFO] [logging.py:107:log_dist] [Rank 0] step=288, skipped=0, lr=[1e-05], mom=[0.0] +steps: 288 loss: 0.0447 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:02:46,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=289, skipped=0, lr=[1e-05], mom=[0.0] +steps: 289 loss: 0.2688 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-05 20:02:57,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=290, skipped=0, lr=[1e-05], mom=[0.0] +steps: 290 loss: 0.0360 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:03:07,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=291, skipped=0, lr=[1e-05], mom=[0.0] +steps: 291 loss: 0.0496 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:03:18,343] [INFO] [logging.py:107:log_dist] [Rank 0] step=292, skipped=0, lr=[1e-05], mom=[0.0] +steps: 292 loss: 0.0706 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:03:29,016] [INFO] [logging.py:107:log_dist] [Rank 0] step=293, skipped=0, lr=[1e-05], mom=[0.0] +steps: 293 loss: 0.1021 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:03:39,686] [INFO] [logging.py:107:log_dist] [Rank 0] step=294, skipped=0, lr=[1e-05], mom=[0.0] +steps: 294 loss: 0.0521 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:03:50,354] [INFO] [logging.py:107:log_dist] [Rank 0] step=295, skipped=0, lr=[1e-05], mom=[0.0] +steps: 295 loss: 0.2159 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:04:01,021] [INFO] [logging.py:107:log_dist] [Rank 0] step=296, skipped=0, lr=[1e-05], mom=[0.0] +steps: 296 loss: 0.0544 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:04:11,691] [INFO] [logging.py:107:log_dist] [Rank 0] step=297, skipped=0, lr=[1e-05], mom=[0.0] +steps: 297 loss: 0.0816 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:04:22,518] [INFO] [logging.py:107:log_dist] [Rank 0] step=298, skipped=0, lr=[1e-05], mom=[0.0] +steps: 298 loss: 0.1011 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-05 20:04:33,187] [INFO] [logging.py:107:log_dist] [Rank 0] step=299, skipped=0, lr=[1e-05], mom=[0.0] +steps: 299 loss: 0.0622 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:04:43,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=300, skipped=0, lr=[1e-05], mom=[0.0] +steps: 300 loss: 0.0794 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:04:54,530] [INFO] [logging.py:107:log_dist] [Rank 0] step=301, skipped=0, lr=[1e-05], mom=[0.0] +steps: 301 loss: 0.0441 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:05:05,198] [INFO] [logging.py:107:log_dist] [Rank 0] step=302, skipped=0, lr=[1e-05], mom=[0.0] +steps: 302 loss: 0.1055 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:05:15,873] [INFO] [logging.py:107:log_dist] [Rank 0] step=303, skipped=0, lr=[1e-05], mom=[0.0] +steps: 303 loss: 0.0925 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 20:05:26,543] [INFO] [logging.py:107:log_dist] [Rank 0] step=304, skipped=0, lr=[1e-05], mom=[0.0] +steps: 304 loss: 0.0948 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:05:37,219] [INFO] [logging.py:107:log_dist] [Rank 0] step=305, skipped=0, lr=[1e-05], mom=[0.0] +steps: 305 loss: 0.0452 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 20:05:48,078] [INFO] [logging.py:107:log_dist] [Rank 0] step=306, skipped=0, lr=[1e-05], mom=[0.0] +steps: 306 loss: 0.2416 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 20:05:58,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=307, skipped=0, lr=[1e-05], mom=[0.0] +steps: 307 loss: 0.1429 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 20:06:09,424] [INFO] [logging.py:107:log_dist] [Rank 0] step=308, skipped=0, lr=[1e-05], mom=[0.0] +steps: 308 loss: 0.1464 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:06:20,091] [INFO] [logging.py:107:log_dist] [Rank 0] step=309, skipped=0, lr=[1e-05], mom=[0.0] +steps: 309 loss: 0.0565 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:06:30,764] [INFO] [logging.py:107:log_dist] [Rank 0] step=310, skipped=0, lr=[1e-05], mom=[0.0] +steps: 310 loss: 0.1179 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:06:41,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=311, skipped=0, lr=[1e-05], mom=[0.0] +steps: 311 loss: 0.3990 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:06:52,106] [INFO] [logging.py:107:log_dist] [Rank 0] step=312, skipped=0, lr=[1e-05], mom=[0.0] +steps: 312 loss: 0.0393 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:07:02,775] [INFO] [logging.py:107:log_dist] [Rank 0] step=313, skipped=0, lr=[1e-05], mom=[0.0] +steps: 313 loss: 0.0627 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:07:13,446] [INFO] [logging.py:107:log_dist] [Rank 0] step=314, skipped=0, lr=[1e-05], mom=[0.0] +steps: 314 loss: 0.0588 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:07:24,308] [INFO] [logging.py:107:log_dist] [Rank 0] step=315, skipped=0, lr=[1e-05], mom=[0.0] +steps: 315 loss: 0.0601 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-05 20:07:34,977] [INFO] [logging.py:107:log_dist] [Rank 0] step=316, skipped=0, lr=[1e-05], mom=[0.0] +steps: 316 loss: 0.0820 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:07:45,648] [INFO] [logging.py:107:log_dist] [Rank 0] step=317, skipped=0, lr=[1e-05], mom=[0.0] +steps: 317 loss: 0.2169 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:07:56,318] [INFO] [logging.py:107:log_dist] [Rank 0] step=318, skipped=0, lr=[1e-05], mom=[0.0] +steps: 318 loss: 0.0975 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:08:06,984] [INFO] [logging.py:107:log_dist] [Rank 0] step=319, skipped=0, lr=[1e-05], mom=[0.0] +steps: 319 loss: 0.0476 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:08:17,655] [INFO] [logging.py:107:log_dist] [Rank 0] step=320, skipped=0, lr=[1e-05], mom=[0.0] +steps: 320 loss: 0.1165 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:08:28,326] [INFO] [logging.py:107:log_dist] [Rank 0] step=321, skipped=0, lr=[1e-05], mom=[0.0] +steps: 321 loss: 0.1570 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:08:38,992] [INFO] [logging.py:107:log_dist] [Rank 0] step=322, skipped=0, lr=[1e-05], mom=[0.0] +steps: 322 loss: 0.0394 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:08:49,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=323, skipped=0, lr=[1e-05], mom=[0.0] +steps: 323 loss: 0.0614 iter time (s): 10.837 samples/sec: 0.092 +[2025-05-05 20:09:00,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=324, skipped=0, lr=[1e-05], mom=[0.0] +steps: 324 loss: 0.1966 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:09:11,196] [INFO] [logging.py:107:log_dist] [Rank 0] step=325, skipped=0, lr=[1e-05], mom=[0.0] +steps: 325 loss: 0.0481 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:09:21,866] [INFO] [logging.py:107:log_dist] [Rank 0] step=326, skipped=0, lr=[1e-05], mom=[0.0] +steps: 326 loss: 0.1458 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:09:32,535] [INFO] [logging.py:107:log_dist] [Rank 0] step=327, skipped=0, lr=[1e-05], mom=[0.0] +steps: 327 loss: 0.0601 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:09:43,200] [INFO] [logging.py:107:log_dist] [Rank 0] step=328, skipped=0, lr=[1e-05], mom=[0.0] +steps: 328 loss: 0.0920 iter time (s): 10.637 samples/sec: 0.094 +Started new epoch: 9 +[2025-05-05 20:09:54,206] [INFO] [logging.py:107:log_dist] [Rank 0] step=329, skipped=0, lr=[1e-05], mom=[0.0] +steps: 329 loss: 0.0680 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:10:04,878] [INFO] [logging.py:107:log_dist] [Rank 0] step=330, skipped=0, lr=[1e-05], mom=[0.0] +steps: 330 loss: 0.0446 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:10:15,545] [INFO] [logging.py:107:log_dist] [Rank 0] step=331, skipped=0, lr=[1e-05], mom=[0.0] +steps: 331 loss: 0.0515 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:10:26,410] [INFO] [logging.py:107:log_dist] [Rank 0] step=332, skipped=0, lr=[1e-05], mom=[0.0] +steps: 332 loss: 0.0647 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-05 20:10:37,083] [INFO] [logging.py:107:log_dist] [Rank 0] step=333, skipped=0, lr=[1e-05], mom=[0.0] +steps: 333 loss: 0.0371 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:10:47,753] [INFO] [logging.py:107:log_dist] [Rank 0] step=334, skipped=0, lr=[1e-05], mom=[0.0] +steps: 334 loss: 0.0524 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:10:58,421] [INFO] [logging.py:107:log_dist] [Rank 0] step=335, skipped=0, lr=[1e-05], mom=[0.0] +steps: 335 loss: 0.0450 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:11:09,088] [INFO] [logging.py:107:log_dist] [Rank 0] step=336, skipped=0, lr=[1e-05], mom=[0.0] +steps: 336 loss: 0.0295 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:11:19,751] [INFO] [logging.py:107:log_dist] [Rank 0] step=337, skipped=0, lr=[1e-05], mom=[0.0] +steps: 337 loss: 0.1177 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-05 20:11:30,423] [INFO] [logging.py:107:log_dist] [Rank 0] step=338, skipped=0, lr=[1e-05], mom=[0.0] +steps: 338 loss: 0.1284 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:11:41,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=339, skipped=0, lr=[1e-05], mom=[0.0] +steps: 339 loss: 0.1410 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 20:11:51,757] [INFO] [logging.py:107:log_dist] [Rank 0] step=340, skipped=0, lr=[1e-05], mom=[0.0] +steps: 340 loss: 0.0336 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:12:02,623] [INFO] [logging.py:107:log_dist] [Rank 0] step=341, skipped=0, lr=[1e-05], mom=[0.0] +steps: 341 loss: 0.1107 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-05 20:12:13,289] [INFO] [logging.py:107:log_dist] [Rank 0] step=342, skipped=0, lr=[1e-05], mom=[0.0] +steps: 342 loss: 0.0464 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:12:23,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=343, skipped=0, lr=[1e-05], mom=[0.0] +steps: 343 loss: 0.2182 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:12:34,634] [INFO] [logging.py:107:log_dist] [Rank 0] step=344, skipped=0, lr=[1e-05], mom=[0.0] +steps: 344 loss: 0.0355 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:12:45,304] [INFO] [logging.py:107:log_dist] [Rank 0] step=345, skipped=0, lr=[1e-05], mom=[0.0] +steps: 345 loss: 0.0413 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:12:55,975] [INFO] [logging.py:107:log_dist] [Rank 0] step=346, skipped=0, lr=[1e-05], mom=[0.0] +steps: 346 loss: 0.2124 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:13:06,649] [INFO] [logging.py:107:log_dist] [Rank 0] step=347, skipped=0, lr=[1e-05], mom=[0.0] +steps: 347 loss: 0.0425 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:13:17,329] [INFO] [logging.py:107:log_dist] [Rank 0] step=348, skipped=0, lr=[1e-05], mom=[0.0] +steps: 348 loss: 0.0599 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:13:27,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=349, skipped=0, lr=[1e-05], mom=[0.0] +steps: 349 loss: 0.1253 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:13:38,859] [INFO] [logging.py:107:log_dist] [Rank 0] step=350, skipped=0, lr=[1e-05], mom=[0.0] +steps: 350 loss: 0.0380 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-05 20:13:49,524] [INFO] [logging.py:107:log_dist] [Rank 0] step=351, skipped=0, lr=[1e-05], mom=[0.0] +steps: 351 loss: 0.1042 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 20:14:00,197] [INFO] [logging.py:107:log_dist] [Rank 0] step=352, skipped=0, lr=[1e-05], mom=[0.0] +steps: 352 loss: 0.0577 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:14:10,872] [INFO] [logging.py:107:log_dist] [Rank 0] step=353, skipped=0, lr=[1e-05], mom=[0.0] +steps: 353 loss: 0.1337 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:14:21,541] [INFO] [logging.py:107:log_dist] [Rank 0] step=354, skipped=0, lr=[1e-05], mom=[0.0] +steps: 354 loss: 0.0390 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:14:32,210] [INFO] [logging.py:107:log_dist] [Rank 0] step=355, skipped=0, lr=[1e-05], mom=[0.0] +steps: 355 loss: 0.0704 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:14:42,880] [INFO] [logging.py:107:log_dist] [Rank 0] step=356, skipped=0, lr=[1e-05], mom=[0.0] +steps: 356 loss: 0.1284 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:14:53,563] [INFO] [logging.py:107:log_dist] [Rank 0] step=357, skipped=0, lr=[1e-05], mom=[0.0] +steps: 357 loss: 0.0576 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:15:04,397] [INFO] [logging.py:107:log_dist] [Rank 0] step=358, skipped=0, lr=[1e-05], mom=[0.0] +steps: 358 loss: 0.0430 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 20:15:15,077] [INFO] [logging.py:107:log_dist] [Rank 0] step=359, skipped=0, lr=[1e-05], mom=[0.0] +steps: 359 loss: 0.1577 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:15:25,750] [INFO] [logging.py:107:log_dist] [Rank 0] step=360, skipped=0, lr=[1e-05], mom=[0.0] +steps: 360 loss: 0.0688 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:15:36,419] [INFO] [logging.py:107:log_dist] [Rank 0] step=361, skipped=0, lr=[1e-05], mom=[0.0] +steps: 361 loss: 0.0473 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:15:47,088] [INFO] [logging.py:107:log_dist] [Rank 0] step=362, skipped=0, lr=[1e-05], mom=[0.0] +steps: 362 loss: 0.0530 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:15:57,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=363, skipped=0, lr=[1e-05], mom=[0.0] +steps: 363 loss: 0.0773 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:16:08,430] [INFO] [logging.py:107:log_dist] [Rank 0] step=364, skipped=0, lr=[1e-05], mom=[0.0] +steps: 364 loss: 0.0645 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:16:19,103] [INFO] [logging.py:107:log_dist] [Rank 0] step=365, skipped=0, lr=[1e-05], mom=[0.0] +steps: 365 loss: 0.0385 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:16:29,774] [INFO] [logging.py:107:log_dist] [Rank 0] step=366, skipped=0, lr=[1e-05], mom=[0.0] +steps: 366 loss: 0.0362 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:16:40,608] [INFO] [logging.py:107:log_dist] [Rank 0] step=367, skipped=0, lr=[1e-05], mom=[0.0] +steps: 367 loss: 0.1912 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 20:16:51,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=368, skipped=0, lr=[1e-05], mom=[0.0] +steps: 368 loss: 0.0831 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:17:01,944] [INFO] [logging.py:107:log_dist] [Rank 0] step=369, skipped=0, lr=[1e-05], mom=[0.0] +steps: 369 loss: 0.0679 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 10 +[2025-05-05 20:17:12,959] [INFO] [logging.py:107:log_dist] [Rank 0] step=370, skipped=0, lr=[1e-05], mom=[0.0] +steps: 370 loss: 0.0579 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:17:23,630] [INFO] [logging.py:107:log_dist] [Rank 0] step=371, skipped=0, lr=[1e-05], mom=[0.0] +steps: 371 loss: 0.0866 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:17:34,296] [INFO] [logging.py:107:log_dist] [Rank 0] step=372, skipped=0, lr=[1e-05], mom=[0.0] +steps: 372 loss: 0.1471 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:17:44,964] [INFO] [logging.py:107:log_dist] [Rank 0] step=373, skipped=0, lr=[1e-05], mom=[0.0] +steps: 373 loss: 0.0552 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:17:55,631] [INFO] [logging.py:107:log_dist] [Rank 0] step=374, skipped=0, lr=[1e-05], mom=[0.0] +steps: 374 loss: 0.0362 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:18:06,298] [INFO] [logging.py:107:log_dist] [Rank 0] step=375, skipped=0, lr=[1e-05], mom=[0.0] +steps: 375 loss: 0.0384 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:18:17,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=376, skipped=0, lr=[1e-05], mom=[0.0] +steps: 376 loss: 0.0474 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 20:18:27,801] [INFO] [logging.py:107:log_dist] [Rank 0] step=377, skipped=0, lr=[1e-05], mom=[0.0] +steps: 377 loss: 0.0358 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:18:38,471] [INFO] [logging.py:107:log_dist] [Rank 0] step=378, skipped=0, lr=[1e-05], mom=[0.0] +steps: 378 loss: 0.0742 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:18:49,144] [INFO] [logging.py:107:log_dist] [Rank 0] step=379, skipped=0, lr=[1e-05], mom=[0.0] +steps: 379 loss: 0.0591 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:18:59,812] [INFO] [logging.py:107:log_dist] [Rank 0] step=380, skipped=0, lr=[1e-05], mom=[0.0] +steps: 380 loss: 0.0938 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:19:10,480] [INFO] [logging.py:107:log_dist] [Rank 0] step=381, skipped=0, lr=[1e-05], mom=[0.0] +steps: 381 loss: 0.0307 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:19:21,155] [INFO] [logging.py:107:log_dist] [Rank 0] step=382, skipped=0, lr=[1e-05], mom=[0.0] +steps: 382 loss: 0.0407 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:19:31,834] [INFO] [logging.py:107:log_dist] [Rank 0] step=383, skipped=0, lr=[1e-05], mom=[0.0] +steps: 383 loss: 0.0712 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:19:42,688] [INFO] [logging.py:107:log_dist] [Rank 0] step=384, skipped=0, lr=[1e-05], mom=[0.0] +steps: 384 loss: 0.0256 iter time (s): 10.823 samples/sec: 0.092 +[2025-05-05 20:19:53,362] [INFO] [logging.py:107:log_dist] [Rank 0] step=385, skipped=0, lr=[1e-05], mom=[0.0] +steps: 385 loss: 0.0729 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:20:04,032] [INFO] [logging.py:107:log_dist] [Rank 0] step=386, skipped=0, lr=[1e-05], mom=[0.0] +steps: 386 loss: 0.0404 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:20:14,698] [INFO] [logging.py:107:log_dist] [Rank 0] step=387, skipped=0, lr=[1e-05], mom=[0.0] +steps: 387 loss: 0.0460 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:20:25,376] [INFO] [logging.py:107:log_dist] [Rank 0] step=388, skipped=0, lr=[1e-05], mom=[0.0] +steps: 388 loss: 0.0633 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 20:20:36,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=389, skipped=0, lr=[1e-05], mom=[0.0] +steps: 389 loss: 0.1291 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:20:46,720] [INFO] [logging.py:107:log_dist] [Rank 0] step=390, skipped=0, lr=[1e-05], mom=[0.0] +steps: 390 loss: 0.0400 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:20:57,390] [INFO] [logging.py:107:log_dist] [Rank 0] step=391, skipped=0, lr=[1e-05], mom=[0.0] +steps: 391 loss: 0.0380 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:21:08,058] [INFO] [logging.py:107:log_dist] [Rank 0] step=392, skipped=0, lr=[1e-05], mom=[0.0] +steps: 392 loss: 0.3277 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:21:18,886] [INFO] [logging.py:107:log_dist] [Rank 0] step=393, skipped=0, lr=[1e-05], mom=[0.0] +steps: 393 loss: 0.0472 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-05 20:21:29,555] [INFO] [logging.py:107:log_dist] [Rank 0] step=394, skipped=0, lr=[1e-05], mom=[0.0] +steps: 394 loss: 0.0992 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:21:40,224] [INFO] [logging.py:107:log_dist] [Rank 0] step=395, skipped=0, lr=[1e-05], mom=[0.0] +steps: 395 loss: 0.0463 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:21:50,894] [INFO] [logging.py:107:log_dist] [Rank 0] step=396, skipped=0, lr=[1e-05], mom=[0.0] +steps: 396 loss: 0.0677 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:22:01,567] [INFO] [logging.py:107:log_dist] [Rank 0] step=397, skipped=0, lr=[1e-05], mom=[0.0] +steps: 397 loss: 0.1798 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:22:12,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=398, skipped=0, lr=[1e-05], mom=[0.0] +steps: 398 loss: 0.0747 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:22:22,903] [INFO] [logging.py:107:log_dist] [Rank 0] step=399, skipped=0, lr=[1e-05], mom=[0.0] +steps: 399 loss: 0.1969 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:22:33,577] [INFO] [logging.py:107:log_dist] [Rank 0] step=400, skipped=0, lr=[1e-05], mom=[0.0] +steps: 400 loss: 0.0621 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:22:44,253] [INFO] [logging.py:107:log_dist] [Rank 0] step=401, skipped=0, lr=[1e-05], mom=[0.0] +steps: 401 loss: 0.0668 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:22:55,076] [INFO] [logging.py:107:log_dist] [Rank 0] step=402, skipped=0, lr=[1e-05], mom=[0.0] +steps: 402 loss: 0.0482 iter time (s): 10.791 samples/sec: 0.093 +[2025-05-05 20:23:05,748] [INFO] [logging.py:107:log_dist] [Rank 0] step=403, skipped=0, lr=[1e-05], mom=[0.0] +steps: 403 loss: 0.0558 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:23:16,417] [INFO] [logging.py:107:log_dist] [Rank 0] step=404, skipped=0, lr=[1e-05], mom=[0.0] +steps: 404 loss: 0.0694 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:23:27,086] [INFO] [logging.py:107:log_dist] [Rank 0] step=405, skipped=0, lr=[1e-05], mom=[0.0] +steps: 405 loss: 0.0318 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:23:37,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=406, skipped=0, lr=[1e-05], mom=[0.0] +steps: 406 loss: 0.0521 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:23:48,421] [INFO] [logging.py:107:log_dist] [Rank 0] step=407, skipped=0, lr=[1e-05], mom=[0.0] +steps: 407 loss: 0.2060 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:23:59,085] [INFO] [logging.py:107:log_dist] [Rank 0] step=408, skipped=0, lr=[1e-05], mom=[0.0] +steps: 408 loss: 0.2004 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 20:24:09,757] [INFO] [logging.py:107:log_dist] [Rank 0] step=409, skipped=0, lr=[1e-05], mom=[0.0] +steps: 409 loss: 0.1064 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:24:20,607] [INFO] [logging.py:107:log_dist] [Rank 0] step=410, skipped=0, lr=[1e-05], mom=[0.0] +steps: 410 loss: 0.0503 iter time (s): 10.823 samples/sec: 0.092 +Saving model to directory epoch10 +Started new epoch: 11 +[2025-05-05 20:24:33,291] [INFO] [logging.py:107:log_dist] [Rank 0] step=411, skipped=0, lr=[1e-05], mom=[0.0] +steps: 411 loss: 0.0326 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 20:24:43,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=412, skipped=0, lr=[1e-05], mom=[0.0] +steps: 412 loss: 0.0818 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:24:54,627] [INFO] [logging.py:107:log_dist] [Rank 0] step=413, skipped=0, lr=[1e-05], mom=[0.0] +steps: 413 loss: 0.0596 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:25:05,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=414, skipped=0, lr=[1e-05], mom=[0.0] +steps: 414 loss: 0.1262 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:25:15,975] [INFO] [logging.py:107:log_dist] [Rank 0] step=415, skipped=0, lr=[1e-05], mom=[0.0] +steps: 415 loss: 0.1309 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:25:26,645] [INFO] [logging.py:107:log_dist] [Rank 0] step=416, skipped=0, lr=[1e-05], mom=[0.0] +steps: 416 loss: 0.0861 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:25:37,327] [INFO] [logging.py:107:log_dist] [Rank 0] step=417, skipped=0, lr=[1e-05], mom=[0.0] +steps: 417 loss: 0.1080 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:25:48,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=418, skipped=0, lr=[1e-05], mom=[0.0] +steps: 418 loss: 0.0470 iter time (s): 10.819 samples/sec: 0.092 +[2025-05-05 20:25:58,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=419, skipped=0, lr=[1e-05], mom=[0.0] +steps: 419 loss: 0.0841 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:26:09,524] [INFO] [logging.py:107:log_dist] [Rank 0] step=420, skipped=0, lr=[1e-05], mom=[0.0] +steps: 420 loss: 0.1661 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:26:20,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=421, skipped=0, lr=[1e-05], mom=[0.0] +steps: 421 loss: 0.0715 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:26:30,864] [INFO] [logging.py:107:log_dist] [Rank 0] step=422, skipped=0, lr=[1e-05], mom=[0.0] +steps: 422 loss: 0.0339 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:26:41,539] [INFO] [logging.py:107:log_dist] [Rank 0] step=423, skipped=0, lr=[1e-05], mom=[0.0] +steps: 423 loss: 0.0339 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:26:52,206] [INFO] [logging.py:107:log_dist] [Rank 0] step=424, skipped=0, lr=[1e-05], mom=[0.0] +steps: 424 loss: 0.0616 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:27:02,878] [INFO] [logging.py:107:log_dist] [Rank 0] step=425, skipped=0, lr=[1e-05], mom=[0.0] +steps: 425 loss: 0.2078 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:27:13,553] [INFO] [logging.py:107:log_dist] [Rank 0] step=426, skipped=0, lr=[1e-05], mom=[0.0] +steps: 426 loss: 0.0316 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:27:24,414] [INFO] [logging.py:107:log_dist] [Rank 0] step=427, skipped=0, lr=[1e-05], mom=[0.0] +steps: 427 loss: 0.0294 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-05 20:27:35,082] [INFO] [logging.py:107:log_dist] [Rank 0] step=428, skipped=0, lr=[1e-05], mom=[0.0] +steps: 428 loss: 0.0894 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:27:45,756] [INFO] [logging.py:107:log_dist] [Rank 0] step=429, skipped=0, lr=[1e-05], mom=[0.0] +steps: 429 loss: 0.0524 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:27:56,431] [INFO] [logging.py:107:log_dist] [Rank 0] step=430, skipped=0, lr=[1e-05], mom=[0.0] +steps: 430 loss: 0.0333 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:28:07,102] [INFO] [logging.py:107:log_dist] [Rank 0] step=431, skipped=0, lr=[1e-05], mom=[0.0] +steps: 431 loss: 0.0401 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:28:17,773] [INFO] [logging.py:107:log_dist] [Rank 0] step=432, skipped=0, lr=[1e-05], mom=[0.0] +steps: 432 loss: 0.2425 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:28:28,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=433, skipped=0, lr=[1e-05], mom=[0.0] +steps: 433 loss: 0.0574 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:28:39,112] [INFO] [logging.py:107:log_dist] [Rank 0] step=434, skipped=0, lr=[1e-05], mom=[0.0] +steps: 434 loss: 0.0390 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:28:49,966] [INFO] [logging.py:107:log_dist] [Rank 0] step=435, skipped=0, lr=[1e-05], mom=[0.0] +steps: 435 loss: 0.0521 iter time (s): 10.824 samples/sec: 0.092 +[2025-05-05 20:29:00,632] [INFO] [logging.py:107:log_dist] [Rank 0] step=436, skipped=0, lr=[1e-05], mom=[0.0] +steps: 436 loss: 0.0509 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 20:29:11,302] [INFO] [logging.py:107:log_dist] [Rank 0] step=437, skipped=0, lr=[1e-05], mom=[0.0] +steps: 437 loss: 0.2038 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:29:21,973] [INFO] [logging.py:107:log_dist] [Rank 0] step=438, skipped=0, lr=[1e-05], mom=[0.0] +steps: 438 loss: 0.0347 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:29:32,645] [INFO] [logging.py:107:log_dist] [Rank 0] step=439, skipped=0, lr=[1e-05], mom=[0.0] +steps: 439 loss: 0.0764 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:29:43,311] [INFO] [logging.py:107:log_dist] [Rank 0] step=440, skipped=0, lr=[1e-05], mom=[0.0] +steps: 440 loss: 0.0481 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:29:53,992] [INFO] [logging.py:107:log_dist] [Rank 0] step=441, skipped=0, lr=[1e-05], mom=[0.0] +steps: 441 loss: 0.0331 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 20:30:04,664] [INFO] [logging.py:107:log_dist] [Rank 0] step=442, skipped=0, lr=[1e-05], mom=[0.0] +steps: 442 loss: 0.0914 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:30:15,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=443, skipped=0, lr=[1e-05], mom=[0.0] +steps: 443 loss: 0.0452 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:30:26,176] [INFO] [logging.py:107:log_dist] [Rank 0] step=444, skipped=0, lr=[1e-05], mom=[0.0] +steps: 444 loss: 0.0587 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-05 20:30:36,844] [INFO] [logging.py:107:log_dist] [Rank 0] step=445, skipped=0, lr=[1e-05], mom=[0.0] +steps: 445 loss: 0.0384 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:30:47,522] [INFO] [logging.py:107:log_dist] [Rank 0] step=446, skipped=0, lr=[1e-05], mom=[0.0] +steps: 446 loss: 0.0321 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:30:58,194] [INFO] [logging.py:107:log_dist] [Rank 0] step=447, skipped=0, lr=[1e-05], mom=[0.0] +steps: 447 loss: 0.0721 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:31:08,861] [INFO] [logging.py:107:log_dist] [Rank 0] step=448, skipped=0, lr=[1e-05], mom=[0.0] +steps: 448 loss: 0.0853 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:31:19,533] [INFO] [logging.py:107:log_dist] [Rank 0] step=449, skipped=0, lr=[1e-05], mom=[0.0] +steps: 449 loss: 0.0743 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:31:30,204] [INFO] [logging.py:107:log_dist] [Rank 0] step=450, skipped=0, lr=[1e-05], mom=[0.0] +steps: 450 loss: 0.0409 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:31:40,868] [INFO] [logging.py:107:log_dist] [Rank 0] step=451, skipped=0, lr=[1e-05], mom=[0.0] +steps: 451 loss: 0.0568 iter time (s): 10.636 samples/sec: 0.094 +Started new epoch: 12 +[2025-05-05 20:31:51,870] [INFO] [logging.py:107:log_dist] [Rank 0] step=452, skipped=0, lr=[1e-05], mom=[0.0] +steps: 452 loss: 0.1211 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:32:02,701] [INFO] [logging.py:107:log_dist] [Rank 0] step=453, skipped=0, lr=[1e-05], mom=[0.0] +steps: 453 loss: 0.0616 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 20:32:13,368] [INFO] [logging.py:107:log_dist] [Rank 0] step=454, skipped=0, lr=[1e-05], mom=[0.0] +steps: 454 loss: 0.1859 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:32:24,041] [INFO] [logging.py:107:log_dist] [Rank 0] step=455, skipped=0, lr=[1e-05], mom=[0.0] +steps: 455 loss: 0.1015 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:32:34,716] [INFO] [logging.py:107:log_dist] [Rank 0] step=456, skipped=0, lr=[1e-05], mom=[0.0] +steps: 456 loss: 0.0355 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:32:45,391] [INFO] [logging.py:107:log_dist] [Rank 0] step=457, skipped=0, lr=[1e-05], mom=[0.0] +steps: 457 loss: 0.0405 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:32:56,063] [INFO] [logging.py:107:log_dist] [Rank 0] step=458, skipped=0, lr=[1e-05], mom=[0.0] +steps: 458 loss: 0.1187 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:33:06,736] [INFO] [logging.py:107:log_dist] [Rank 0] step=459, skipped=0, lr=[1e-05], mom=[0.0] +steps: 459 loss: 0.2278 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:33:17,407] [INFO] [logging.py:107:log_dist] [Rank 0] step=460, skipped=0, lr=[1e-05], mom=[0.0] +steps: 460 loss: 0.0480 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:33:28,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=461, skipped=0, lr=[1e-05], mom=[0.0] +steps: 461 loss: 0.0479 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-05 20:33:38,906] [INFO] [logging.py:107:log_dist] [Rank 0] step=462, skipped=0, lr=[1e-05], mom=[0.0] +steps: 462 loss: 0.0421 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:33:49,577] [INFO] [logging.py:107:log_dist] [Rank 0] step=463, skipped=0, lr=[1e-05], mom=[0.0] +steps: 463 loss: 0.0402 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:34:00,250] [INFO] [logging.py:107:log_dist] [Rank 0] step=464, skipped=0, lr=[1e-05], mom=[0.0] +steps: 464 loss: 0.0483 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:34:10,920] [INFO] [logging.py:107:log_dist] [Rank 0] step=465, skipped=0, lr=[1e-05], mom=[0.0] +steps: 465 loss: 0.0502 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:34:21,589] [INFO] [logging.py:107:log_dist] [Rank 0] step=466, skipped=0, lr=[1e-05], mom=[0.0] +steps: 466 loss: 0.0930 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:34:32,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=467, skipped=0, lr=[1e-05], mom=[0.0] +steps: 467 loss: 0.0318 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:34:42,929] [INFO] [logging.py:107:log_dist] [Rank 0] step=468, skipped=0, lr=[1e-05], mom=[0.0] +steps: 468 loss: 0.1788 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:34:53,598] [INFO] [logging.py:107:log_dist] [Rank 0] step=469, skipped=0, lr=[1e-05], mom=[0.0] +steps: 469 loss: 0.0977 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:35:04,433] [INFO] [logging.py:107:log_dist] [Rank 0] step=470, skipped=0, lr=[1e-05], mom=[0.0] +steps: 470 loss: 0.0397 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 20:35:15,105] [INFO] [logging.py:107:log_dist] [Rank 0] step=471, skipped=0, lr=[1e-05], mom=[0.0] +steps: 471 loss: 0.0305 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:35:25,773] [INFO] [logging.py:107:log_dist] [Rank 0] step=472, skipped=0, lr=[1e-05], mom=[0.0] +steps: 472 loss: 0.0301 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:35:36,445] [INFO] [logging.py:107:log_dist] [Rank 0] step=473, skipped=0, lr=[1e-05], mom=[0.0] +steps: 473 loss: 0.0468 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:35:47,115] [INFO] [logging.py:107:log_dist] [Rank 0] step=474, skipped=0, lr=[1e-05], mom=[0.0] +steps: 474 loss: 0.0750 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:35:57,782] [INFO] [logging.py:107:log_dist] [Rank 0] step=475, skipped=0, lr=[1e-05], mom=[0.0] +steps: 475 loss: 0.0330 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:36:08,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=476, skipped=0, lr=[1e-05], mom=[0.0] +steps: 476 loss: 0.1170 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 20:36:19,126] [INFO] [logging.py:107:log_dist] [Rank 0] step=477, skipped=0, lr=[1e-05], mom=[0.0] +steps: 477 loss: 0.2791 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:36:29,962] [INFO] [logging.py:107:log_dist] [Rank 0] step=478, skipped=0, lr=[1e-05], mom=[0.0] +steps: 478 loss: 0.0314 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 20:36:40,642] [INFO] [logging.py:107:log_dist] [Rank 0] step=479, skipped=0, lr=[1e-05], mom=[0.0] +steps: 479 loss: 0.0401 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 20:36:51,310] [INFO] [logging.py:107:log_dist] [Rank 0] step=480, skipped=0, lr=[1e-05], mom=[0.0] +steps: 480 loss: 0.3176 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:37:01,978] [INFO] [logging.py:107:log_dist] [Rank 0] step=481, skipped=0, lr=[1e-05], mom=[0.0] +steps: 481 loss: 0.0817 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:37:12,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=482, skipped=0, lr=[1e-05], mom=[0.0] +steps: 482 loss: 0.0354 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 20:37:23,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=483, skipped=0, lr=[1e-05], mom=[0.0] +steps: 483 loss: 0.0793 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 20:37:34,004] [INFO] [logging.py:107:log_dist] [Rank 0] step=484, skipped=0, lr=[1e-05], mom=[0.0] +steps: 484 loss: 0.0854 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:37:44,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=485, skipped=0, lr=[1e-05], mom=[0.0] +steps: 485 loss: 0.0420 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:37:55,508] [INFO] [logging.py:107:log_dist] [Rank 0] step=486, skipped=0, lr=[1e-05], mom=[0.0] +steps: 486 loss: 0.0350 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 20:38:06,176] [INFO] [logging.py:107:log_dist] [Rank 0] step=487, skipped=0, lr=[1e-05], mom=[0.0] +steps: 487 loss: 0.0998 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:38:16,846] [INFO] [logging.py:107:log_dist] [Rank 0] step=488, skipped=0, lr=[1e-05], mom=[0.0] +steps: 488 loss: 0.0399 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:38:27,512] [INFO] [logging.py:107:log_dist] [Rank 0] step=489, skipped=0, lr=[1e-05], mom=[0.0] +steps: 489 loss: 0.0924 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:38:38,182] [INFO] [logging.py:107:log_dist] [Rank 0] step=490, skipped=0, lr=[1e-05], mom=[0.0] +steps: 490 loss: 0.0389 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:38:48,853] [INFO] [logging.py:107:log_dist] [Rank 0] step=491, skipped=0, lr=[1e-05], mom=[0.0] +steps: 491 loss: 0.0921 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:38:59,518] [INFO] [logging.py:107:log_dist] [Rank 0] step=492, skipped=0, lr=[1e-05], mom=[0.0] +steps: 492 loss: 0.0553 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 13 +[2025-05-05 20:39:10,530] [INFO] [logging.py:107:log_dist] [Rank 0] step=493, skipped=0, lr=[1e-05], mom=[0.0] +steps: 493 loss: 0.2330 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:39:21,203] [INFO] [logging.py:107:log_dist] [Rank 0] step=494, skipped=0, lr=[1e-05], mom=[0.0] +steps: 494 loss: 0.0827 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:39:32,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=495, skipped=0, lr=[1e-05], mom=[0.0] +steps: 495 loss: 0.1260 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 20:39:42,732] [INFO] [logging.py:107:log_dist] [Rank 0] step=496, skipped=0, lr=[1e-05], mom=[0.0] +steps: 496 loss: 0.1130 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:39:53,406] [INFO] [logging.py:107:log_dist] [Rank 0] step=497, skipped=0, lr=[1e-05], mom=[0.0] +steps: 497 loss: 0.1157 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:40:04,074] [INFO] [logging.py:107:log_dist] [Rank 0] step=498, skipped=0, lr=[1e-05], mom=[0.0] +steps: 498 loss: 0.0871 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:40:14,746] [INFO] [logging.py:107:log_dist] [Rank 0] step=499, skipped=0, lr=[1e-05], mom=[0.0] +steps: 499 loss: 0.0618 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:40:25,416] [INFO] [logging.py:107:log_dist] [Rank 0] step=500, skipped=0, lr=[1e-05], mom=[0.0] +steps: 500 loss: 0.1142 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:40:36,085] [INFO] [logging.py:107:log_dist] [Rank 0] step=501, skipped=0, lr=[1e-05], mom=[0.0] +steps: 501 loss: 0.0522 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:40:46,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=502, skipped=0, lr=[1e-05], mom=[0.0] +steps: 502 loss: 0.0893 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:40:57,588] [INFO] [logging.py:107:log_dist] [Rank 0] step=503, skipped=0, lr=[1e-05], mom=[0.0] +steps: 503 loss: 0.3749 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 20:41:08,255] [INFO] [logging.py:107:log_dist] [Rank 0] step=504, skipped=0, lr=[1e-05], mom=[0.0] +steps: 504 loss: 0.0632 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:41:18,923] [INFO] [logging.py:107:log_dist] [Rank 0] step=505, skipped=0, lr=[1e-05], mom=[0.0] +steps: 505 loss: 0.0843 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:41:29,599] [INFO] [logging.py:107:log_dist] [Rank 0] step=506, skipped=0, lr=[1e-05], mom=[0.0] +steps: 506 loss: 0.1267 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:41:40,267] [INFO] [logging.py:107:log_dist] [Rank 0] step=507, skipped=0, lr=[1e-05], mom=[0.0] +steps: 507 loss: 0.0496 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:41:50,934] [INFO] [logging.py:107:log_dist] [Rank 0] step=508, skipped=0, lr=[1e-05], mom=[0.0] +steps: 508 loss: 0.0371 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:42:01,604] [INFO] [logging.py:107:log_dist] [Rank 0] step=509, skipped=0, lr=[1e-05], mom=[0.0] +steps: 509 loss: 0.0371 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:42:12,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=510, skipped=0, lr=[1e-05], mom=[0.0] +steps: 510 loss: 0.1017 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:42:22,947] [INFO] [logging.py:107:log_dist] [Rank 0] step=511, skipped=0, lr=[1e-05], mom=[0.0] +steps: 511 loss: 0.6419 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:42:33,788] [INFO] [logging.py:107:log_dist] [Rank 0] step=512, skipped=0, lr=[1e-05], mom=[0.0] +steps: 512 loss: 0.0991 iter time (s): 10.809 samples/sec: 0.093 +[2025-05-05 20:42:44,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=513, skipped=0, lr=[1e-05], mom=[0.0] +steps: 513 loss: 0.1027 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:42:55,123] [INFO] [logging.py:107:log_dist] [Rank 0] step=514, skipped=0, lr=[1e-05], mom=[0.0] +steps: 514 loss: 0.0678 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:43:05,795] [INFO] [logging.py:107:log_dist] [Rank 0] step=515, skipped=0, lr=[1e-05], mom=[0.0] +steps: 515 loss: 0.0935 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:43:16,464] [INFO] [logging.py:107:log_dist] [Rank 0] step=516, skipped=0, lr=[1e-05], mom=[0.0] +steps: 516 loss: 0.1130 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:43:27,132] [INFO] [logging.py:107:log_dist] [Rank 0] step=517, skipped=0, lr=[1e-05], mom=[0.0] +steps: 517 loss: 0.0553 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:43:37,804] [INFO] [logging.py:107:log_dist] [Rank 0] step=518, skipped=0, lr=[1e-05], mom=[0.0] +steps: 518 loss: 0.0678 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:43:48,474] [INFO] [logging.py:107:log_dist] [Rank 0] step=519, skipped=0, lr=[1e-05], mom=[0.0] +steps: 519 loss: 0.0757 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:43:59,317] [INFO] [logging.py:107:log_dist] [Rank 0] step=520, skipped=0, lr=[1e-05], mom=[0.0] +steps: 520 loss: 0.0360 iter time (s): 10.848 samples/sec: 0.092 +[2025-05-05 20:44:10,027] [INFO] [logging.py:107:log_dist] [Rank 0] step=521, skipped=0, lr=[1e-05], mom=[0.0] +steps: 521 loss: 0.0354 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:44:20,696] [INFO] [logging.py:107:log_dist] [Rank 0] step=522, skipped=0, lr=[1e-05], mom=[0.0] +steps: 522 loss: 0.0596 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:44:31,371] [INFO] [logging.py:107:log_dist] [Rank 0] step=523, skipped=0, lr=[1e-05], mom=[0.0] +steps: 523 loss: 0.0541 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:44:42,040] [INFO] [logging.py:107:log_dist] [Rank 0] step=524, skipped=0, lr=[1e-05], mom=[0.0] +steps: 524 loss: 0.0417 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:44:52,710] [INFO] [logging.py:107:log_dist] [Rank 0] step=525, skipped=0, lr=[1e-05], mom=[0.0] +steps: 525 loss: 0.0408 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:45:03,385] [INFO] [logging.py:107:log_dist] [Rank 0] step=526, skipped=0, lr=[1e-05], mom=[0.0] +steps: 526 loss: 0.0778 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:45:14,052] [INFO] [logging.py:107:log_dist] [Rank 0] step=527, skipped=0, lr=[1e-05], mom=[0.0] +steps: 527 loss: 0.0680 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:45:24,727] [INFO] [logging.py:107:log_dist] [Rank 0] step=528, skipped=0, lr=[1e-05], mom=[0.0] +steps: 528 loss: 0.1216 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:45:35,581] [INFO] [logging.py:107:log_dist] [Rank 0] step=529, skipped=0, lr=[1e-05], mom=[0.0] +steps: 529 loss: 0.4968 iter time (s): 10.823 samples/sec: 0.092 +[2025-05-05 20:45:46,266] [INFO] [logging.py:107:log_dist] [Rank 0] step=530, skipped=0, lr=[1e-05], mom=[0.0] +steps: 530 loss: 0.0317 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-05 20:45:56,934] [INFO] [logging.py:107:log_dist] [Rank 0] step=531, skipped=0, lr=[1e-05], mom=[0.0] +steps: 531 loss: 0.1727 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:46:07,604] [INFO] [logging.py:107:log_dist] [Rank 0] step=532, skipped=0, lr=[1e-05], mom=[0.0] +steps: 532 loss: 0.0415 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:46:18,268] [INFO] [logging.py:107:log_dist] [Rank 0] step=533, skipped=0, lr=[1e-05], mom=[0.0] +steps: 533 loss: 0.0768 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 14 +[2025-05-05 20:46:29,279] [INFO] [logging.py:107:log_dist] [Rank 0] step=534, skipped=0, lr=[1e-05], mom=[0.0] +steps: 534 loss: 0.0445 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:46:39,956] [INFO] [logging.py:107:log_dist] [Rank 0] step=535, skipped=0, lr=[1e-05], mom=[0.0] +steps: 535 loss: 0.3306 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 20:46:50,629] [INFO] [logging.py:107:log_dist] [Rank 0] step=536, skipped=0, lr=[1e-05], mom=[0.0] +steps: 536 loss: 0.0382 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 20:47:01,295] [INFO] [logging.py:107:log_dist] [Rank 0] step=537, skipped=0, lr=[1e-05], mom=[0.0] +steps: 537 loss: 0.1128 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 20:47:12,122] [INFO] [logging.py:107:log_dist] [Rank 0] step=538, skipped=0, lr=[1e-05], mom=[0.0] +steps: 538 loss: 0.1223 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-05 20:47:22,789] [INFO] [logging.py:107:log_dist] [Rank 0] step=539, skipped=0, lr=[1e-05], mom=[0.0] +steps: 539 loss: 0.0363 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:47:33,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=540, skipped=0, lr=[1e-05], mom=[0.0] +steps: 540 loss: 0.1195 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:47:44,133] [INFO] [logging.py:107:log_dist] [Rank 0] step=541, skipped=0, lr=[1e-05], mom=[0.0] +steps: 541 loss: 0.2967 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:47:54,806] [INFO] [logging.py:107:log_dist] [Rank 0] step=542, skipped=0, lr=[1e-05], mom=[0.0] +steps: 542 loss: 0.0616 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:48:05,475] [INFO] [logging.py:107:log_dist] [Rank 0] step=543, skipped=0, lr=[1e-05], mom=[0.0] +steps: 543 loss: 0.1696 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:48:16,147] [INFO] [logging.py:107:log_dist] [Rank 0] step=544, skipped=0, lr=[1e-05], mom=[0.0] +steps: 544 loss: 0.0557 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:48:26,816] [INFO] [logging.py:107:log_dist] [Rank 0] step=545, skipped=0, lr=[1e-05], mom=[0.0] +steps: 545 loss: 0.2735 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:48:37,647] [INFO] [logging.py:107:log_dist] [Rank 0] step=546, skipped=0, lr=[1e-05], mom=[0.0] +steps: 546 loss: 0.0633 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 20:48:48,319] [INFO] [logging.py:107:log_dist] [Rank 0] step=547, skipped=0, lr=[1e-05], mom=[0.0] +steps: 547 loss: 0.0388 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:48:58,987] [INFO] [logging.py:107:log_dist] [Rank 0] step=548, skipped=0, lr=[1e-05], mom=[0.0] +steps: 548 loss: 0.0428 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:49:09,655] [INFO] [logging.py:107:log_dist] [Rank 0] step=549, skipped=0, lr=[1e-05], mom=[0.0] +steps: 549 loss: 0.0684 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:49:20,327] [INFO] [logging.py:107:log_dist] [Rank 0] step=550, skipped=0, lr=[1e-05], mom=[0.0] +steps: 550 loss: 0.0470 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:49:31,003] [INFO] [logging.py:107:log_dist] [Rank 0] step=551, skipped=0, lr=[1e-05], mom=[0.0] +steps: 551 loss: 0.0380 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 20:49:41,668] [INFO] [logging.py:107:log_dist] [Rank 0] step=552, skipped=0, lr=[1e-05], mom=[0.0] +steps: 552 loss: 0.0941 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 20:49:52,342] [INFO] [logging.py:107:log_dist] [Rank 0] step=553, skipped=0, lr=[1e-05], mom=[0.0] +steps: 553 loss: 0.0514 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:50:03,175] [INFO] [logging.py:107:log_dist] [Rank 0] step=554, skipped=0, lr=[1e-05], mom=[0.0] +steps: 554 loss: 0.0486 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 20:50:13,845] [INFO] [logging.py:107:log_dist] [Rank 0] step=555, skipped=0, lr=[1e-05], mom=[0.0] +steps: 555 loss: 0.0533 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:50:24,521] [INFO] [logging.py:107:log_dist] [Rank 0] step=556, skipped=0, lr=[1e-05], mom=[0.0] +steps: 556 loss: 0.0689 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:50:35,193] [INFO] [logging.py:107:log_dist] [Rank 0] step=557, skipped=0, lr=[1e-05], mom=[0.0] +steps: 557 loss: 0.0874 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:50:45,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=558, skipped=0, lr=[1e-05], mom=[0.0] +steps: 558 loss: 0.0632 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:50:56,534] [INFO] [logging.py:107:log_dist] [Rank 0] step=559, skipped=0, lr=[1e-05], mom=[0.0] +steps: 559 loss: 0.0330 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:51:07,203] [INFO] [logging.py:107:log_dist] [Rank 0] step=560, skipped=0, lr=[1e-05], mom=[0.0] +steps: 560 loss: 0.0749 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:51:17,870] [INFO] [logging.py:107:log_dist] [Rank 0] step=561, skipped=0, lr=[1e-05], mom=[0.0] +steps: 561 loss: 0.0924 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:51:28,544] [INFO] [logging.py:107:log_dist] [Rank 0] step=562, skipped=0, lr=[1e-05], mom=[0.0] +steps: 562 loss: 0.1336 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:51:39,395] [INFO] [logging.py:107:log_dist] [Rank 0] step=563, skipped=0, lr=[1e-05], mom=[0.0] +steps: 563 loss: 0.0342 iter time (s): 10.821 samples/sec: 0.092 +[2025-05-05 20:51:50,064] [INFO] [logging.py:107:log_dist] [Rank 0] step=564, skipped=0, lr=[1e-05], mom=[0.0] +steps: 564 loss: 0.0480 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:52:00,739] [INFO] [logging.py:107:log_dist] [Rank 0] step=565, skipped=0, lr=[1e-05], mom=[0.0] +steps: 565 loss: 0.1940 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:52:11,407] [INFO] [logging.py:107:log_dist] [Rank 0] step=566, skipped=0, lr=[1e-05], mom=[0.0] +steps: 566 loss: 0.2478 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:52:22,077] [INFO] [logging.py:107:log_dist] [Rank 0] step=567, skipped=0, lr=[1e-05], mom=[0.0] +steps: 567 loss: 0.3681 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:52:32,751] [INFO] [logging.py:107:log_dist] [Rank 0] step=568, skipped=0, lr=[1e-05], mom=[0.0] +steps: 568 loss: 0.0551 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:52:43,419] [INFO] [logging.py:107:log_dist] [Rank 0] step=569, skipped=0, lr=[1e-05], mom=[0.0] +steps: 569 loss: 0.2896 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:52:54,086] [INFO] [logging.py:107:log_dist] [Rank 0] step=570, skipped=0, lr=[1e-05], mom=[0.0] +steps: 570 loss: 0.0581 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:53:04,918] [INFO] [logging.py:107:log_dist] [Rank 0] step=571, skipped=0, lr=[1e-05], mom=[0.0] +steps: 571 loss: 0.2306 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 20:53:15,586] [INFO] [logging.py:107:log_dist] [Rank 0] step=572, skipped=0, lr=[1e-05], mom=[0.0] +steps: 572 loss: 0.1386 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:53:26,264] [INFO] [logging.py:107:log_dist] [Rank 0] step=573, skipped=0, lr=[1e-05], mom=[0.0] +steps: 573 loss: 0.1550 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 20:53:36,928] [INFO] [logging.py:107:log_dist] [Rank 0] step=574, skipped=0, lr=[1e-05], mom=[0.0] +steps: 574 loss: 0.0989 iter time (s): 10.637 samples/sec: 0.094 +Started new epoch: 15 +[2025-05-05 20:53:47,933] [INFO] [logging.py:107:log_dist] [Rank 0] step=575, skipped=0, lr=[1e-05], mom=[0.0] +steps: 575 loss: 0.0903 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:53:58,614] [INFO] [logging.py:107:log_dist] [Rank 0] step=576, skipped=0, lr=[1e-05], mom=[0.0] +steps: 576 loss: 0.0793 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 20:54:09,288] [INFO] [logging.py:107:log_dist] [Rank 0] step=577, skipped=0, lr=[1e-05], mom=[0.0] +steps: 577 loss: 0.1121 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:54:19,965] [INFO] [logging.py:107:log_dist] [Rank 0] step=578, skipped=0, lr=[1e-05], mom=[0.0] +steps: 578 loss: 0.0718 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:54:30,638] [INFO] [logging.py:107:log_dist] [Rank 0] step=579, skipped=0, lr=[1e-05], mom=[0.0] +steps: 579 loss: 0.0815 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:54:41,476] [INFO] [logging.py:107:log_dist] [Rank 0] step=580, skipped=0, lr=[1e-05], mom=[0.0] +steps: 580 loss: 0.0640 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-05 20:54:52,146] [INFO] [logging.py:107:log_dist] [Rank 0] step=581, skipped=0, lr=[1e-05], mom=[0.0] +steps: 581 loss: 0.0764 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:55:02,818] [INFO] [logging.py:107:log_dist] [Rank 0] step=582, skipped=0, lr=[1e-05], mom=[0.0] +steps: 582 loss: 0.1946 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:55:13,485] [INFO] [logging.py:107:log_dist] [Rank 0] step=583, skipped=0, lr=[1e-05], mom=[0.0] +steps: 583 loss: 0.0692 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 20:55:24,155] [INFO] [logging.py:107:log_dist] [Rank 0] step=584, skipped=0, lr=[1e-05], mom=[0.0] +steps: 584 loss: 0.5869 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:55:34,830] [INFO] [logging.py:107:log_dist] [Rank 0] step=585, skipped=0, lr=[1e-05], mom=[0.0] +steps: 585 loss: 0.0423 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:55:45,499] [INFO] [logging.py:107:log_dist] [Rank 0] step=586, skipped=0, lr=[1e-05], mom=[0.0] +steps: 586 loss: 0.0515 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:55:56,167] [INFO] [logging.py:107:log_dist] [Rank 0] step=587, skipped=0, lr=[1e-05], mom=[0.0] +steps: 587 loss: 0.0480 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:56:06,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=588, skipped=0, lr=[1e-05], mom=[0.0] +steps: 588 loss: 0.2695 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 20:56:17,667] [INFO] [logging.py:107:log_dist] [Rank 0] step=589, skipped=0, lr=[1e-05], mom=[0.0] +steps: 589 loss: 0.0255 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:56:28,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=590, skipped=0, lr=[1e-05], mom=[0.0] +steps: 590 loss: 0.0414 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:56:39,010] [INFO] [logging.py:107:log_dist] [Rank 0] step=591, skipped=0, lr=[1e-05], mom=[0.0] +steps: 591 loss: 0.0423 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 20:56:49,682] [INFO] [logging.py:107:log_dist] [Rank 0] step=592, skipped=0, lr=[1e-05], mom=[0.0] +steps: 592 loss: 0.0539 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:57:00,350] [INFO] [logging.py:107:log_dist] [Rank 0] step=593, skipped=0, lr=[1e-05], mom=[0.0] +steps: 593 loss: 0.3278 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:57:11,021] [INFO] [logging.py:107:log_dist] [Rank 0] step=594, skipped=0, lr=[1e-05], mom=[0.0] +steps: 594 loss: 0.0504 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 20:57:21,692] [INFO] [logging.py:107:log_dist] [Rank 0] step=595, skipped=0, lr=[1e-05], mom=[0.0] +steps: 595 loss: 0.0310 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:57:32,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=596, skipped=0, lr=[1e-05], mom=[0.0] +steps: 596 loss: 0.2853 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:57:43,223] [INFO] [logging.py:107:log_dist] [Rank 0] step=597, skipped=0, lr=[1e-05], mom=[0.0] +steps: 597 loss: 0.0740 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-05 20:57:53,892] [INFO] [logging.py:107:log_dist] [Rank 0] step=598, skipped=0, lr=[1e-05], mom=[0.0] +steps: 598 loss: 0.0371 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 20:58:04,557] [INFO] [logging.py:107:log_dist] [Rank 0] step=599, skipped=0, lr=[1e-05], mom=[0.0] +steps: 599 loss: 0.0444 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 20:58:15,231] [INFO] [logging.py:107:log_dist] [Rank 0] step=600, skipped=0, lr=[1e-05], mom=[0.0] +steps: 600 loss: 0.0365 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:58:25,902] [INFO] [logging.py:107:log_dist] [Rank 0] step=601, skipped=0, lr=[1e-05], mom=[0.0] +steps: 601 loss: 0.0730 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:58:36,573] [INFO] [logging.py:107:log_dist] [Rank 0] step=602, skipped=0, lr=[1e-05], mom=[0.0] +steps: 602 loss: 0.0871 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:58:47,245] [INFO] [logging.py:107:log_dist] [Rank 0] step=603, skipped=0, lr=[1e-05], mom=[0.0] +steps: 603 loss: 0.0660 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:58:57,916] [INFO] [logging.py:107:log_dist] [Rank 0] step=604, skipped=0, lr=[1e-05], mom=[0.0] +steps: 604 loss: 0.1146 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 20:59:08,744] [INFO] [logging.py:107:log_dist] [Rank 0] step=605, skipped=0, lr=[1e-05], mom=[0.0] +steps: 605 loss: 0.1272 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-05 20:59:19,415] [INFO] [logging.py:107:log_dist] [Rank 0] step=606, skipped=0, lr=[1e-05], mom=[0.0] +steps: 606 loss: 0.0392 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 20:59:30,088] [INFO] [logging.py:107:log_dist] [Rank 0] step=607, skipped=0, lr=[1e-05], mom=[0.0] +steps: 607 loss: 0.0405 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 20:59:40,767] [INFO] [logging.py:107:log_dist] [Rank 0] step=608, skipped=0, lr=[1e-05], mom=[0.0] +steps: 608 loss: 0.0710 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 20:59:51,441] [INFO] [logging.py:107:log_dist] [Rank 0] step=609, skipped=0, lr=[1e-05], mom=[0.0] +steps: 609 loss: 0.0286 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:00:02,117] [INFO] [logging.py:107:log_dist] [Rank 0] step=610, skipped=0, lr=[1e-05], mom=[0.0] +steps: 610 loss: 0.1653 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:00:12,793] [INFO] [logging.py:107:log_dist] [Rank 0] step=611, skipped=0, lr=[1e-05], mom=[0.0] +steps: 611 loss: 0.0607 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:00:23,470] [INFO] [logging.py:107:log_dist] [Rank 0] step=612, skipped=0, lr=[1e-05], mom=[0.0] +steps: 612 loss: 0.1848 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:00:34,142] [INFO] [logging.py:107:log_dist] [Rank 0] step=613, skipped=0, lr=[1e-05], mom=[0.0] +steps: 613 loss: 0.0569 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:00:44,978] [INFO] [logging.py:107:log_dist] [Rank 0] step=614, skipped=0, lr=[1e-05], mom=[0.0] +steps: 614 loss: 0.0351 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 21:00:55,649] [INFO] [logging.py:107:log_dist] [Rank 0] step=615, skipped=0, lr=[1e-05], mom=[0.0] +steps: 615 loss: 0.0920 iter time (s): 10.644 samples/sec: 0.094 +Started new epoch: 16 +[2025-05-05 21:01:06,656] [INFO] [logging.py:107:log_dist] [Rank 0] step=616, skipped=0, lr=[1e-05], mom=[0.0] +steps: 616 loss: 0.1031 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:01:17,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=617, skipped=0, lr=[1e-05], mom=[0.0] +steps: 617 loss: 0.0811 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:01:27,996] [INFO] [logging.py:107:log_dist] [Rank 0] step=618, skipped=0, lr=[1e-05], mom=[0.0] +steps: 618 loss: 0.0373 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:01:38,669] [INFO] [logging.py:107:log_dist] [Rank 0] step=619, skipped=0, lr=[1e-05], mom=[0.0] +steps: 619 loss: 0.0311 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:01:49,337] [INFO] [logging.py:107:log_dist] [Rank 0] step=620, skipped=0, lr=[1e-05], mom=[0.0] +steps: 620 loss: 0.0763 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:02:00,009] [INFO] [logging.py:107:log_dist] [Rank 0] step=621, skipped=0, lr=[1e-05], mom=[0.0] +steps: 621 loss: 0.3424 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:02:10,839] [INFO] [logging.py:107:log_dist] [Rank 0] step=622, skipped=0, lr=[1e-05], mom=[0.0] +steps: 622 loss: 0.0350 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-05 21:02:21,511] [INFO] [logging.py:107:log_dist] [Rank 0] step=623, skipped=0, lr=[1e-05], mom=[0.0] +steps: 623 loss: 0.0313 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:02:32,189] [INFO] [logging.py:107:log_dist] [Rank 0] step=624, skipped=0, lr=[1e-05], mom=[0.0] +steps: 624 loss: 0.0358 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 21:02:42,861] [INFO] [logging.py:107:log_dist] [Rank 0] step=625, skipped=0, lr=[1e-05], mom=[0.0] +steps: 625 loss: 0.1745 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:02:53,528] [INFO] [logging.py:107:log_dist] [Rank 0] step=626, skipped=0, lr=[1e-05], mom=[0.0] +steps: 626 loss: 0.2491 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:03:04,200] [INFO] [logging.py:107:log_dist] [Rank 0] step=627, skipped=0, lr=[1e-05], mom=[0.0] +steps: 627 loss: 0.0275 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:03:14,879] [INFO] [logging.py:107:log_dist] [Rank 0] step=628, skipped=0, lr=[1e-05], mom=[0.0] +steps: 628 loss: 0.2089 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:03:25,564] [INFO] [logging.py:107:log_dist] [Rank 0] step=629, skipped=0, lr=[1e-05], mom=[0.0] +steps: 629 loss: 0.1533 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-05 21:03:36,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=630, skipped=0, lr=[1e-05], mom=[0.0] +steps: 630 loss: 0.0670 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:03:47,100] [INFO] [logging.py:107:log_dist] [Rank 0] step=631, skipped=0, lr=[1e-05], mom=[0.0] +steps: 631 loss: 0.0380 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-05 21:03:57,783] [INFO] [logging.py:107:log_dist] [Rank 0] step=632, skipped=0, lr=[1e-05], mom=[0.0] +steps: 632 loss: 0.0478 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:04:08,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=633, skipped=0, lr=[1e-05], mom=[0.0] +steps: 633 loss: 0.0725 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:04:19,122] [INFO] [logging.py:107:log_dist] [Rank 0] step=634, skipped=0, lr=[1e-05], mom=[0.0] +steps: 634 loss: 0.3156 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 21:04:29,797] [INFO] [logging.py:107:log_dist] [Rank 0] step=635, skipped=0, lr=[1e-05], mom=[0.0] +steps: 635 loss: 0.0395 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:04:40,465] [INFO] [logging.py:107:log_dist] [Rank 0] step=636, skipped=0, lr=[1e-05], mom=[0.0] +steps: 636 loss: 0.0871 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:04:51,140] [INFO] [logging.py:107:log_dist] [Rank 0] step=637, skipped=0, lr=[1e-05], mom=[0.0] +steps: 637 loss: 0.0353 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:05:01,822] [INFO] [logging.py:107:log_dist] [Rank 0] step=638, skipped=0, lr=[1e-05], mom=[0.0] +steps: 638 loss: 0.2232 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-05 21:05:12,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=639, skipped=0, lr=[1e-05], mom=[0.0] +steps: 639 loss: 0.0336 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 21:05:23,326] [INFO] [logging.py:107:log_dist] [Rank 0] step=640, skipped=0, lr=[1e-05], mom=[0.0] +steps: 640 loss: 0.0676 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:05:34,004] [INFO] [logging.py:107:log_dist] [Rank 0] step=641, skipped=0, lr=[1e-05], mom=[0.0] +steps: 641 loss: 0.0493 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:05:44,677] [INFO] [logging.py:107:log_dist] [Rank 0] step=642, skipped=0, lr=[1e-05], mom=[0.0] +steps: 642 loss: 0.1063 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:05:55,346] [INFO] [logging.py:107:log_dist] [Rank 0] step=643, skipped=0, lr=[1e-05], mom=[0.0] +steps: 643 loss: 0.0284 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:06:06,020] [INFO] [logging.py:107:log_dist] [Rank 0] step=644, skipped=0, lr=[1e-05], mom=[0.0] +steps: 644 loss: 0.2544 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:06:16,688] [INFO] [logging.py:107:log_dist] [Rank 0] step=645, skipped=0, lr=[1e-05], mom=[0.0] +steps: 645 loss: 0.1466 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:06:27,359] [INFO] [logging.py:107:log_dist] [Rank 0] step=646, skipped=0, lr=[1e-05], mom=[0.0] +steps: 646 loss: 0.0313 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:06:38,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=647, skipped=0, lr=[1e-05], mom=[0.0] +steps: 647 loss: 0.2123 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-05 21:06:48,878] [INFO] [logging.py:107:log_dist] [Rank 0] step=648, skipped=0, lr=[1e-05], mom=[0.0] +steps: 648 loss: 0.0324 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 21:06:59,544] [INFO] [logging.py:107:log_dist] [Rank 0] step=649, skipped=0, lr=[1e-05], mom=[0.0] +steps: 649 loss: 0.0436 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 21:07:10,213] [INFO] [logging.py:107:log_dist] [Rank 0] step=650, skipped=0, lr=[1e-05], mom=[0.0] +steps: 650 loss: 0.0627 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:07:20,881] [INFO] [logging.py:107:log_dist] [Rank 0] step=651, skipped=0, lr=[1e-05], mom=[0.0] +steps: 651 loss: 0.0466 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:07:31,550] [INFO] [logging.py:107:log_dist] [Rank 0] step=652, skipped=0, lr=[1e-05], mom=[0.0] +steps: 652 loss: 0.1787 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:07:42,221] [INFO] [logging.py:107:log_dist] [Rank 0] step=653, skipped=0, lr=[1e-05], mom=[0.0] +steps: 653 loss: 0.0714 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:07:52,891] [INFO] [logging.py:107:log_dist] [Rank 0] step=654, skipped=0, lr=[1e-05], mom=[0.0] +steps: 654 loss: 0.1214 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:08:03,562] [INFO] [logging.py:107:log_dist] [Rank 0] step=655, skipped=0, lr=[1e-05], mom=[0.0] +steps: 655 loss: 0.0862 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:08:14,388] [INFO] [logging.py:107:log_dist] [Rank 0] step=656, skipped=0, lr=[1e-05], mom=[0.0] +steps: 656 loss: 0.1403 iter time (s): 10.799 samples/sec: 0.093 +Started new epoch: 17 +[2025-05-05 21:08:25,409] [INFO] [logging.py:107:log_dist] [Rank 0] step=657, skipped=0, lr=[1e-05], mom=[0.0] +steps: 657 loss: 0.0757 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:08:36,076] [INFO] [logging.py:107:log_dist] [Rank 0] step=658, skipped=0, lr=[1e-05], mom=[0.0] +steps: 658 loss: 0.0359 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:08:46,756] [INFO] [logging.py:107:log_dist] [Rank 0] step=659, skipped=0, lr=[1e-05], mom=[0.0] +steps: 659 loss: 0.0508 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:08:57,427] [INFO] [logging.py:107:log_dist] [Rank 0] step=660, skipped=0, lr=[1e-05], mom=[0.0] +steps: 660 loss: 0.0362 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:09:08,098] [INFO] [logging.py:107:log_dist] [Rank 0] step=661, skipped=0, lr=[1e-05], mom=[0.0] +steps: 661 loss: 0.1029 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:09:18,770] [INFO] [logging.py:107:log_dist] [Rank 0] step=662, skipped=0, lr=[1e-05], mom=[0.0] +steps: 662 loss: 0.0312 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:09:29,439] [INFO] [logging.py:107:log_dist] [Rank 0] step=663, skipped=0, lr=[1e-05], mom=[0.0] +steps: 663 loss: 0.0351 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:09:40,106] [INFO] [logging.py:107:log_dist] [Rank 0] step=664, skipped=0, lr=[1e-05], mom=[0.0] +steps: 664 loss: 0.0648 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:09:50,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=665, skipped=0, lr=[1e-05], mom=[0.0] +steps: 665 loss: 0.0345 iter time (s): 10.824 samples/sec: 0.092 +[2025-05-05 21:10:01,646] [INFO] [logging.py:107:log_dist] [Rank 0] step=666, skipped=0, lr=[1e-05], mom=[0.0] +steps: 666 loss: 0.0813 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-05 21:10:12,319] [INFO] [logging.py:107:log_dist] [Rank 0] step=667, skipped=0, lr=[1e-05], mom=[0.0] +steps: 667 loss: 0.0689 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:10:22,989] [INFO] [logging.py:107:log_dist] [Rank 0] step=668, skipped=0, lr=[1e-05], mom=[0.0] +steps: 668 loss: 0.0791 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:10:33,659] [INFO] [logging.py:107:log_dist] [Rank 0] step=669, skipped=0, lr=[1e-05], mom=[0.0] +steps: 669 loss: 0.0325 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:10:44,332] [INFO] [logging.py:107:log_dist] [Rank 0] step=670, skipped=0, lr=[1e-05], mom=[0.0] +steps: 670 loss: 0.6210 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:10:55,011] [INFO] [logging.py:107:log_dist] [Rank 0] step=671, skipped=0, lr=[1e-05], mom=[0.0] +steps: 671 loss: 0.0735 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 21:11:05,681] [INFO] [logging.py:107:log_dist] [Rank 0] step=672, skipped=0, lr=[1e-05], mom=[0.0] +steps: 672 loss: 0.0591 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:11:16,522] [INFO] [logging.py:107:log_dist] [Rank 0] step=673, skipped=0, lr=[1e-05], mom=[0.0] +steps: 673 loss: 0.2228 iter time (s): 10.809 samples/sec: 0.093 +[2025-05-05 21:11:27,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=674, skipped=0, lr=[1e-05], mom=[0.0] +steps: 674 loss: 0.0882 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:11:27,202] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step674 is about to be saved! +[2025-05-05 21:11:27,215] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_00-model_states.pt... +[2025-05-05 21:11:27,216] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_00-model_states.pt. +[2025-05-05 21:11:27,223] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_01-model_states.pt... +[2025-05-05 21:11:27,229] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_01-model_states.pt. +[2025-05-05 21:11:27,235] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_02-model_states.pt... +[2025-05-05 21:11:27,242] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_02-model_states.pt. +[2025-05-05 21:11:27,246] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_03-model_states.pt... +[2025-05-05 21:11:27,252] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_03-model_states.pt. +[2025-05-05 21:11:27,256] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_04-model_states.pt... +[2025-05-05 21:11:27,263] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_04-model_states.pt. +[2025-05-05 21:11:27,266] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_05-model_states.pt... +[2025-05-05 21:11:27,273] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_05-model_states.pt. +[2025-05-05 21:11:27,276] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_06-model_states.pt... +[2025-05-05 21:11:27,283] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_06-model_states.pt. +[2025-05-05 21:11:27,286] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_07-model_states.pt... +[2025-05-05 21:11:27,292] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_07-model_states.pt. +[2025-05-05 21:11:27,296] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_08-model_states.pt... +[2025-05-05 21:11:27,302] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_08-model_states.pt. +[2025-05-05 21:11:27,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_09-model_states.pt... +[2025-05-05 21:11:27,312] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_09-model_states.pt. +[2025-05-05 21:11:27,316] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_10-model_states.pt... +[2025-05-05 21:11:27,322] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_10-model_states.pt. +[2025-05-05 21:11:27,325] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_11-model_states.pt... +[2025-05-05 21:11:27,332] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_11-model_states.pt. +[2025-05-05 21:11:27,335] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_12-model_states.pt... +[2025-05-05 21:11:27,342] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_12-model_states.pt. +[2025-05-05 21:11:27,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_13-model_states.pt... +[2025-05-05 21:11:27,352] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_13-model_states.pt. +[2025-05-05 21:11:27,355] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_14-model_states.pt... +[2025-05-05 21:11:27,362] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_14-model_states.pt. +[2025-05-05 21:11:27,365] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_15-model_states.pt... +[2025-05-05 21:11:27,371] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_15-model_states.pt. +[2025-05-05 21:11:27,375] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_16-model_states.pt... +[2025-05-05 21:11:27,382] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_16-model_states.pt. +[2025-05-05 21:11:27,385] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_17-model_states.pt... +[2025-05-05 21:11:27,392] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_17-model_states.pt. +[2025-05-05 21:11:27,395] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_18-model_states.pt... +[2025-05-05 21:11:27,402] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_18-model_states.pt. +[2025-05-05 21:11:27,405] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_19-model_states.pt... +[2025-05-05 21:11:27,412] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_19-model_states.pt. +[2025-05-05 21:11:27,416] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_20-model_states.pt... +[2025-05-05 21:11:27,422] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_20-model_states.pt. +[2025-05-05 21:11:27,426] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_21-model_states.pt... +[2025-05-05 21:11:27,432] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_21-model_states.pt. +[2025-05-05 21:11:27,435] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_22-model_states.pt... +[2025-05-05 21:11:27,442] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_22-model_states.pt. +[2025-05-05 21:11:27,445] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_23-model_states.pt... +[2025-05-05 21:11:27,452] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_23-model_states.pt. +[2025-05-05 21:11:27,455] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_24-model_states.pt... +[2025-05-05 21:11:27,461] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_24-model_states.pt. +[2025-05-05 21:11:27,465] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_25-model_states.pt... +[2025-05-05 21:11:27,471] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_25-model_states.pt. +[2025-05-05 21:11:27,475] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_26-model_states.pt... +[2025-05-05 21:11:27,481] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_26-model_states.pt. +[2025-05-05 21:11:27,484] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_27-model_states.pt... +[2025-05-05 21:11:27,491] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_27-model_states.pt. +[2025-05-05 21:11:27,494] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_28-model_states.pt... +[2025-05-05 21:11:27,501] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_28-model_states.pt. +[2025-05-05 21:11:27,504] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_29-model_states.pt... +[2025-05-05 21:11:27,511] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_29-model_states.pt. +[2025-05-05 21:11:27,514] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_30-model_states.pt... +[2025-05-05 21:11:27,520] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_30-model_states.pt. +[2025-05-05 21:11:27,524] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_31-model_states.pt... +[2025-05-05 21:11:27,530] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_31-model_states.pt. +[2025-05-05 21:11:27,533] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_32-model_states.pt... +[2025-05-05 21:11:27,540] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_32-model_states.pt. +[2025-05-05 21:11:27,543] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_33-model_states.pt... +[2025-05-05 21:11:27,550] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_33-model_states.pt. +[2025-05-05 21:11:27,553] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_34-model_states.pt... +[2025-05-05 21:11:27,560] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_34-model_states.pt. +[2025-05-05 21:11:27,563] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_35-model_states.pt... +[2025-05-05 21:11:27,570] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_35-model_states.pt. +[2025-05-05 21:11:27,573] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_36-model_states.pt... +[2025-05-05 21:11:27,580] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_36-model_states.pt. +[2025-05-05 21:11:27,583] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_37-model_states.pt... +[2025-05-05 21:11:27,590] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_37-model_states.pt. +[2025-05-05 21:11:27,593] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_38-model_states.pt... +[2025-05-05 21:11:27,599] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_38-model_states.pt. +[2025-05-05 21:11:27,603] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_39-model_states.pt... +[2025-05-05 21:11:27,609] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_39-model_states.pt. +[2025-05-05 21:11:27,613] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_40-model_states.pt... +[2025-05-05 21:11:27,619] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_40-model_states.pt. +[2025-05-05 21:11:27,620] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_41-model_states.pt... +[2025-05-05 21:11:27,620] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/layer_41-model_states.pt. +[2025-05-05 21:11:27,638] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/mp_rank_00_model_states.pt +[2025-05-05 21:11:27,638] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/mp_rank_00_model_states.pt... +[2025-05-05 21:11:28,650] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step674/mp_rank_00_model_states.pt. +[2025-05-05 21:11:28,651] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step674 is ready now! +[2025-05-05 21:11:39,320] [INFO] [logging.py:107:log_dist] [Rank 0] step=675, skipped=0, lr=[1e-05], mom=[0.0] +steps: 675 loss: 0.0805 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:11:49,988] [INFO] [logging.py:107:log_dist] [Rank 0] step=676, skipped=0, lr=[1e-05], mom=[0.0] +steps: 676 loss: 0.0883 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:12:00,661] [INFO] [logging.py:107:log_dist] [Rank 0] step=677, skipped=0, lr=[1e-05], mom=[0.0] +steps: 677 loss: 0.2233 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:12:11,331] [INFO] [logging.py:107:log_dist] [Rank 0] step=678, skipped=0, lr=[1e-05], mom=[0.0] +steps: 678 loss: 0.0354 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:12:22,006] [INFO] [logging.py:107:log_dist] [Rank 0] step=679, skipped=0, lr=[1e-05], mom=[0.0] +steps: 679 loss: 0.0596 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:12:32,680] [INFO] [logging.py:107:log_dist] [Rank 0] step=680, skipped=0, lr=[1e-05], mom=[0.0] +steps: 680 loss: 0.0413 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:12:43,537] [INFO] [logging.py:107:log_dist] [Rank 0] step=681, skipped=0, lr=[1e-05], mom=[0.0] +steps: 681 loss: 0.0535 iter time (s): 10.825 samples/sec: 0.092 +[2025-05-05 21:12:54,208] [INFO] [logging.py:107:log_dist] [Rank 0] step=682, skipped=0, lr=[1e-05], mom=[0.0] +steps: 682 loss: 0.0770 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:13:04,881] [INFO] [logging.py:107:log_dist] [Rank 0] step=683, skipped=0, lr=[1e-05], mom=[0.0] +steps: 683 loss: 0.0399 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:13:15,551] [INFO] [logging.py:107:log_dist] [Rank 0] step=684, skipped=0, lr=[1e-05], mom=[0.0] +steps: 684 loss: 0.1328 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:13:26,220] [INFO] [logging.py:107:log_dist] [Rank 0] step=685, skipped=0, lr=[1e-05], mom=[0.0] +steps: 685 loss: 0.1337 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:13:36,889] [INFO] [logging.py:107:log_dist] [Rank 0] step=686, skipped=0, lr=[1e-05], mom=[0.0] +steps: 686 loss: 0.0413 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:13:47,560] [INFO] [logging.py:107:log_dist] [Rank 0] step=687, skipped=0, lr=[1e-05], mom=[0.0] +steps: 687 loss: 0.0682 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:13:58,232] [INFO] [logging.py:107:log_dist] [Rank 0] step=688, skipped=0, lr=[1e-05], mom=[0.0] +steps: 688 loss: 0.0564 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:14:08,900] [INFO] [logging.py:107:log_dist] [Rank 0] step=689, skipped=0, lr=[1e-05], mom=[0.0] +steps: 689 loss: 0.0479 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:14:19,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=690, skipped=0, lr=[1e-05], mom=[0.0] +steps: 690 loss: 0.0589 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-05 21:14:30,442] [INFO] [logging.py:107:log_dist] [Rank 0] step=691, skipped=0, lr=[1e-05], mom=[0.0] +steps: 691 loss: 0.0396 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:14:41,112] [INFO] [logging.py:107:log_dist] [Rank 0] step=692, skipped=0, lr=[1e-05], mom=[0.0] +steps: 692 loss: 0.0471 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:14:51,783] [INFO] [logging.py:107:log_dist] [Rank 0] step=693, skipped=0, lr=[1e-05], mom=[0.0] +steps: 693 loss: 0.0691 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:15:02,468] [INFO] [logging.py:107:log_dist] [Rank 0] step=694, skipped=0, lr=[1e-05], mom=[0.0] +steps: 694 loss: 0.0294 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-05 21:15:13,140] [INFO] [logging.py:107:log_dist] [Rank 0] step=695, skipped=0, lr=[1e-05], mom=[0.0] +steps: 695 loss: 0.2688 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:15:23,809] [INFO] [logging.py:107:log_dist] [Rank 0] step=696, skipped=0, lr=[1e-05], mom=[0.0] +steps: 696 loss: 0.2748 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:15:34,479] [INFO] [logging.py:107:log_dist] [Rank 0] step=697, skipped=0, lr=[1e-05], mom=[0.0] +steps: 697 loss: 0.0877 iter time (s): 10.643 samples/sec: 0.094 +Started new epoch: 18 +[2025-05-05 21:15:45,483] [INFO] [logging.py:107:log_dist] [Rank 0] step=698, skipped=0, lr=[1e-05], mom=[0.0] +steps: 698 loss: 0.0308 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:15:56,313] [INFO] [logging.py:107:log_dist] [Rank 0] step=699, skipped=0, lr=[1e-05], mom=[0.0] +steps: 699 loss: 0.0487 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-05 21:16:06,984] [INFO] [logging.py:107:log_dist] [Rank 0] step=700, skipped=0, lr=[1e-05], mom=[0.0] +steps: 700 loss: 0.1042 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:16:17,654] [INFO] [logging.py:107:log_dist] [Rank 0] step=701, skipped=0, lr=[1e-05], mom=[0.0] +steps: 701 loss: 0.0495 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:16:28,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=702, skipped=0, lr=[1e-05], mom=[0.0] +steps: 702 loss: 0.3707 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:16:38,997] [INFO] [logging.py:107:log_dist] [Rank 0] step=703, skipped=0, lr=[1e-05], mom=[0.0] +steps: 703 loss: 0.1954 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:16:49,665] [INFO] [logging.py:107:log_dist] [Rank 0] step=704, skipped=0, lr=[1e-05], mom=[0.0] +steps: 704 loss: 0.2926 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:17:00,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=705, skipped=0, lr=[1e-05], mom=[0.0] +steps: 705 loss: 0.1135 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:17:11,014] [INFO] [logging.py:107:log_dist] [Rank 0] step=706, skipped=0, lr=[1e-05], mom=[0.0] +steps: 706 loss: 0.0534 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:17:21,873] [INFO] [logging.py:107:log_dist] [Rank 0] step=707, skipped=0, lr=[1e-05], mom=[0.0] +steps: 707 loss: 0.2202 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-05 21:17:32,548] [INFO] [logging.py:107:log_dist] [Rank 0] step=708, skipped=0, lr=[1e-05], mom=[0.0] +steps: 708 loss: 0.0700 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:17:43,220] [INFO] [logging.py:107:log_dist] [Rank 0] step=709, skipped=0, lr=[1e-05], mom=[0.0] +steps: 709 loss: 0.0349 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:17:53,889] [INFO] [logging.py:107:log_dist] [Rank 0] step=710, skipped=0, lr=[1e-05], mom=[0.0] +steps: 710 loss: 0.0328 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:18:04,560] [INFO] [logging.py:107:log_dist] [Rank 0] step=711, skipped=0, lr=[1e-05], mom=[0.0] +steps: 711 loss: 0.0783 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:18:15,231] [INFO] [logging.py:107:log_dist] [Rank 0] step=712, skipped=0, lr=[1e-05], mom=[0.0] +steps: 712 loss: 0.0591 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:18:25,905] [INFO] [logging.py:107:log_dist] [Rank 0] step=713, skipped=0, lr=[1e-05], mom=[0.0] +steps: 713 loss: 0.0382 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:18:36,576] [INFO] [logging.py:107:log_dist] [Rank 0] step=714, skipped=0, lr=[1e-05], mom=[0.0] +steps: 714 loss: 0.0373 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:18:47,249] [INFO] [logging.py:107:log_dist] [Rank 0] step=715, skipped=0, lr=[1e-05], mom=[0.0] +steps: 715 loss: 0.1424 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:18:58,108] [INFO] [logging.py:107:log_dist] [Rank 0] step=716, skipped=0, lr=[1e-05], mom=[0.0] +steps: 716 loss: 0.0703 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 21:19:08,779] [INFO] [logging.py:107:log_dist] [Rank 0] step=717, skipped=0, lr=[1e-05], mom=[0.0] +steps: 717 loss: 0.0412 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:19:19,455] [INFO] [logging.py:107:log_dist] [Rank 0] step=718, skipped=0, lr=[1e-05], mom=[0.0] +steps: 718 loss: 0.0319 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:19:30,122] [INFO] [logging.py:107:log_dist] [Rank 0] step=719, skipped=0, lr=[1e-05], mom=[0.0] +steps: 719 loss: 0.1645 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:19:40,789] [INFO] [logging.py:107:log_dist] [Rank 0] step=720, skipped=0, lr=[1e-05], mom=[0.0] +steps: 720 loss: 0.0526 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:19:51,459] [INFO] [logging.py:107:log_dist] [Rank 0] step=721, skipped=0, lr=[1e-05], mom=[0.0] +steps: 721 loss: 0.1682 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:20:02,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=722, skipped=0, lr=[1e-05], mom=[0.0] +steps: 722 loss: 0.2012 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:20:12,805] [INFO] [logging.py:107:log_dist] [Rank 0] step=723, skipped=0, lr=[1e-05], mom=[0.0] +steps: 723 loss: 0.0320 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:20:23,478] [INFO] [logging.py:107:log_dist] [Rank 0] step=724, skipped=0, lr=[1e-05], mom=[0.0] +steps: 724 loss: 0.0797 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:20:34,304] [INFO] [logging.py:107:log_dist] [Rank 0] step=725, skipped=0, lr=[1e-05], mom=[0.0] +steps: 725 loss: 0.1138 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-05 21:20:44,974] [INFO] [logging.py:107:log_dist] [Rank 0] step=726, skipped=0, lr=[1e-05], mom=[0.0] +steps: 726 loss: 0.0590 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:20:55,646] [INFO] [logging.py:107:log_dist] [Rank 0] step=727, skipped=0, lr=[1e-05], mom=[0.0] +steps: 727 loss: 0.1172 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:21:06,316] [INFO] [logging.py:107:log_dist] [Rank 0] step=728, skipped=0, lr=[1e-05], mom=[0.0] +steps: 728 loss: 0.0292 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:21:16,993] [INFO] [logging.py:107:log_dist] [Rank 0] step=729, skipped=0, lr=[1e-05], mom=[0.0] +steps: 729 loss: 0.1028 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:21:27,669] [INFO] [logging.py:107:log_dist] [Rank 0] step=730, skipped=0, lr=[1e-05], mom=[0.0] +steps: 730 loss: 0.0333 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:21:38,340] [INFO] [logging.py:107:log_dist] [Rank 0] step=731, skipped=0, lr=[1e-05], mom=[0.0] +steps: 731 loss: 0.0799 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:21:49,022] [INFO] [logging.py:107:log_dist] [Rank 0] step=732, skipped=0, lr=[1e-05], mom=[0.0] +steps: 732 loss: 0.0862 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:21:59,863] [INFO] [logging.py:107:log_dist] [Rank 0] step=733, skipped=0, lr=[1e-05], mom=[0.0] +steps: 733 loss: 0.0702 iter time (s): 10.810 samples/sec: 0.093 +[2025-05-05 21:22:10,532] [INFO] [logging.py:107:log_dist] [Rank 0] step=734, skipped=0, lr=[1e-05], mom=[0.0] +steps: 734 loss: 0.1134 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:22:21,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=735, skipped=0, lr=[1e-05], mom=[0.0] +steps: 735 loss: 0.0394 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:22:31,869] [INFO] [logging.py:107:log_dist] [Rank 0] step=736, skipped=0, lr=[1e-05], mom=[0.0] +steps: 736 loss: 0.0730 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:22:42,537] [INFO] [logging.py:107:log_dist] [Rank 0] step=737, skipped=0, lr=[1e-05], mom=[0.0] +steps: 737 loss: 0.0373 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:22:53,211] [INFO] [logging.py:107:log_dist] [Rank 0] step=738, skipped=0, lr=[1e-05], mom=[0.0] +steps: 738 loss: 0.3598 iter time (s): 10.646 samples/sec: 0.094 +Started new epoch: 19 +[2025-05-05 21:23:04,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=739, skipped=0, lr=[1e-05], mom=[0.0] +steps: 739 loss: 0.0651 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:23:14,906] [INFO] [logging.py:107:log_dist] [Rank 0] step=740, skipped=0, lr=[1e-05], mom=[0.0] +steps: 740 loss: 0.1073 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:23:25,580] [INFO] [logging.py:107:log_dist] [Rank 0] step=741, skipped=0, lr=[1e-05], mom=[0.0] +steps: 741 loss: 0.1316 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:23:36,417] [INFO] [logging.py:107:log_dist] [Rank 0] step=742, skipped=0, lr=[1e-05], mom=[0.0] +steps: 742 loss: 0.0434 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-05 21:23:47,085] [INFO] [logging.py:107:log_dist] [Rank 0] step=743, skipped=0, lr=[1e-05], mom=[0.0] +steps: 743 loss: 0.1249 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:23:57,757] [INFO] [logging.py:107:log_dist] [Rank 0] step=744, skipped=0, lr=[1e-05], mom=[0.0] +steps: 744 loss: 0.0732 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:24:08,434] [INFO] [logging.py:107:log_dist] [Rank 0] step=745, skipped=0, lr=[1e-05], mom=[0.0] +steps: 745 loss: 0.0855 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:24:19,105] [INFO] [logging.py:107:log_dist] [Rank 0] step=746, skipped=0, lr=[1e-05], mom=[0.0] +steps: 746 loss: 0.0650 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:24:29,780] [INFO] [logging.py:107:log_dist] [Rank 0] step=747, skipped=0, lr=[1e-05], mom=[0.0] +steps: 747 loss: 0.0490 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:24:40,462] [INFO] [logging.py:107:log_dist] [Rank 0] step=748, skipped=0, lr=[1e-05], mom=[0.0] +steps: 748 loss: 0.0359 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:24:51,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=749, skipped=0, lr=[1e-05], mom=[0.0] +steps: 749 loss: 0.1046 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:25:01,993] [INFO] [logging.py:107:log_dist] [Rank 0] step=750, skipped=0, lr=[1e-05], mom=[0.0] +steps: 750 loss: 0.0688 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-05 21:25:12,667] [INFO] [logging.py:107:log_dist] [Rank 0] step=751, skipped=0, lr=[1e-05], mom=[0.0] +steps: 751 loss: 0.1554 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:25:23,337] [INFO] [logging.py:107:log_dist] [Rank 0] step=752, skipped=0, lr=[1e-05], mom=[0.0] +steps: 752 loss: 0.0330 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:25:34,010] [INFO] [logging.py:107:log_dist] [Rank 0] step=753, skipped=0, lr=[1e-05], mom=[0.0] +steps: 753 loss: 0.0249 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:25:44,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=754, skipped=0, lr=[1e-05], mom=[0.0] +steps: 754 loss: 0.0445 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:25:55,355] [INFO] [logging.py:107:log_dist] [Rank 0] step=755, skipped=0, lr=[1e-05], mom=[0.0] +steps: 755 loss: 0.0654 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:26:06,025] [INFO] [logging.py:107:log_dist] [Rank 0] step=756, skipped=0, lr=[1e-05], mom=[0.0] +steps: 756 loss: 0.0354 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:26:16,696] [INFO] [logging.py:107:log_dist] [Rank 0] step=757, skipped=0, lr=[1e-05], mom=[0.0] +steps: 757 loss: 0.3906 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:26:27,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=758, skipped=0, lr=[1e-05], mom=[0.0] +steps: 758 loss: 0.0446 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:26:38,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=759, skipped=0, lr=[1e-05], mom=[0.0] +steps: 759 loss: 0.0812 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-05 21:26:48,898] [INFO] [logging.py:107:log_dist] [Rank 0] step=760, skipped=0, lr=[1e-05], mom=[0.0] +steps: 760 loss: 0.0953 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:26:59,567] [INFO] [logging.py:107:log_dist] [Rank 0] step=761, skipped=0, lr=[1e-05], mom=[0.0] +steps: 761 loss: 0.1876 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:27:10,234] [INFO] [logging.py:107:log_dist] [Rank 0] step=762, skipped=0, lr=[1e-05], mom=[0.0] +steps: 762 loss: 0.0678 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:27:20,904] [INFO] [logging.py:107:log_dist] [Rank 0] step=763, skipped=0, lr=[1e-05], mom=[0.0] +steps: 763 loss: 0.0361 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:27:31,573] [INFO] [logging.py:107:log_dist] [Rank 0] step=764, skipped=0, lr=[1e-05], mom=[0.0] +steps: 764 loss: 0.0635 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:27:42,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=765, skipped=0, lr=[1e-05], mom=[0.0] +steps: 765 loss: 0.0963 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:27:52,917] [INFO] [logging.py:107:log_dist] [Rank 0] step=766, skipped=0, lr=[1e-05], mom=[0.0] +steps: 766 loss: 0.0625 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:28:03,586] [INFO] [logging.py:107:log_dist] [Rank 0] step=767, skipped=0, lr=[1e-05], mom=[0.0] +steps: 767 loss: 0.1067 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:28:14,423] [INFO] [logging.py:107:log_dist] [Rank 0] step=768, skipped=0, lr=[1e-05], mom=[0.0] +steps: 768 loss: 0.0322 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 21:28:25,097] [INFO] [logging.py:107:log_dist] [Rank 0] step=769, skipped=0, lr=[1e-05], mom=[0.0] +steps: 769 loss: 0.0378 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:28:35,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=770, skipped=0, lr=[1e-05], mom=[0.0] +steps: 770 loss: 0.0554 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:28:46,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=771, skipped=0, lr=[1e-05], mom=[0.0] +steps: 771 loss: 0.0334 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:28:57,110] [INFO] [logging.py:107:log_dist] [Rank 0] step=772, skipped=0, lr=[1e-05], mom=[0.0] +steps: 772 loss: 0.0520 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:29:07,782] [INFO] [logging.py:107:log_dist] [Rank 0] step=773, skipped=0, lr=[1e-05], mom=[0.0] +steps: 773 loss: 0.0293 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:29:18,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=774, skipped=0, lr=[1e-05], mom=[0.0] +steps: 774 loss: 0.0858 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:29:29,129] [INFO] [logging.py:107:log_dist] [Rank 0] step=775, skipped=0, lr=[1e-05], mom=[0.0] +steps: 775 loss: 0.0904 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:29:39,980] [INFO] [logging.py:107:log_dist] [Rank 0] step=776, skipped=0, lr=[1e-05], mom=[0.0] +steps: 776 loss: 0.0624 iter time (s): 10.822 samples/sec: 0.092 +[2025-05-05 21:29:50,652] [INFO] [logging.py:107:log_dist] [Rank 0] step=777, skipped=0, lr=[1e-05], mom=[0.0] +steps: 777 loss: 0.0622 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:30:01,319] [INFO] [logging.py:107:log_dist] [Rank 0] step=778, skipped=0, lr=[1e-05], mom=[0.0] +steps: 778 loss: 0.0651 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:30:11,989] [INFO] [logging.py:107:log_dist] [Rank 0] step=779, skipped=0, lr=[1e-05], mom=[0.0] +steps: 779 loss: 0.0722 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 20 +[2025-05-05 21:30:22,995] [INFO] [logging.py:107:log_dist] [Rank 0] step=780, skipped=0, lr=[1e-05], mom=[0.0] +steps: 780 loss: 0.0496 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:30:33,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=781, skipped=0, lr=[1e-05], mom=[0.0] +steps: 781 loss: 0.1807 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:30:44,337] [INFO] [logging.py:107:log_dist] [Rank 0] step=782, skipped=0, lr=[1e-05], mom=[0.0] +steps: 782 loss: 0.1271 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:30:55,022] [INFO] [logging.py:107:log_dist] [Rank 0] step=783, skipped=0, lr=[1e-05], mom=[0.0] +steps: 783 loss: 0.2122 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:31:05,693] [INFO] [logging.py:107:log_dist] [Rank 0] step=784, skipped=0, lr=[1e-05], mom=[0.0] +steps: 784 loss: 0.0461 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:31:16,571] [INFO] [logging.py:107:log_dist] [Rank 0] step=785, skipped=0, lr=[1e-05], mom=[0.0] +steps: 785 loss: 0.0681 iter time (s): 10.843 samples/sec: 0.092 +[2025-05-05 21:31:27,245] [INFO] [logging.py:107:log_dist] [Rank 0] step=786, skipped=0, lr=[1e-05], mom=[0.0] +steps: 786 loss: 0.1035 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:31:37,918] [INFO] [logging.py:107:log_dist] [Rank 0] step=787, skipped=0, lr=[1e-05], mom=[0.0] +steps: 787 loss: 0.0532 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:31:48,589] [INFO] [logging.py:107:log_dist] [Rank 0] step=788, skipped=0, lr=[1e-05], mom=[0.0] +steps: 788 loss: 0.0771 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:31:59,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=789, skipped=0, lr=[1e-05], mom=[0.0] +steps: 789 loss: 0.0427 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:32:09,932] [INFO] [logging.py:107:log_dist] [Rank 0] step=790, skipped=0, lr=[1e-05], mom=[0.0] +steps: 790 loss: 0.1009 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:32:20,606] [INFO] [logging.py:107:log_dist] [Rank 0] step=791, skipped=0, lr=[1e-05], mom=[0.0] +steps: 791 loss: 0.0432 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:32:31,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=792, skipped=0, lr=[1e-05], mom=[0.0] +steps: 792 loss: 0.1345 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:32:41,947] [INFO] [logging.py:107:log_dist] [Rank 0] step=793, skipped=0, lr=[1e-05], mom=[0.0] +steps: 793 loss: 0.0854 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:32:52,780] [INFO] [logging.py:107:log_dist] [Rank 0] step=794, skipped=0, lr=[1e-05], mom=[0.0] +steps: 794 loss: 0.0270 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 21:33:03,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=795, skipped=0, lr=[1e-05], mom=[0.0] +steps: 795 loss: 0.0386 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:33:14,123] [INFO] [logging.py:107:log_dist] [Rank 0] step=796, skipped=0, lr=[1e-05], mom=[0.0] +steps: 796 loss: 0.0400 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:33:24,801] [INFO] [logging.py:107:log_dist] [Rank 0] step=797, skipped=0, lr=[1e-05], mom=[0.0] +steps: 797 loss: 0.0971 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 21:33:35,474] [INFO] [logging.py:107:log_dist] [Rank 0] step=798, skipped=0, lr=[1e-05], mom=[0.0] +steps: 798 loss: 0.0497 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:33:46,142] [INFO] [logging.py:107:log_dist] [Rank 0] step=799, skipped=0, lr=[1e-05], mom=[0.0] +steps: 799 loss: 0.0543 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:33:56,812] [INFO] [logging.py:107:log_dist] [Rank 0] step=800, skipped=0, lr=[1e-05], mom=[0.0] +steps: 800 loss: 0.0922 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:34:07,487] [INFO] [logging.py:107:log_dist] [Rank 0] step=801, skipped=0, lr=[1e-05], mom=[0.0] +steps: 801 loss: 0.0771 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:34:18,320] [INFO] [logging.py:107:log_dist] [Rank 0] step=802, skipped=0, lr=[1e-05], mom=[0.0] +steps: 802 loss: 0.0631 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 21:34:28,998] [INFO] [logging.py:107:log_dist] [Rank 0] step=803, skipped=0, lr=[1e-05], mom=[0.0] +steps: 803 loss: 0.0675 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 21:34:39,669] [INFO] [logging.py:107:log_dist] [Rank 0] step=804, skipped=0, lr=[1e-05], mom=[0.0] +steps: 804 loss: 0.0311 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:34:50,339] [INFO] [logging.py:107:log_dist] [Rank 0] step=805, skipped=0, lr=[1e-05], mom=[0.0] +steps: 805 loss: 0.2422 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:35:01,014] [INFO] [logging.py:107:log_dist] [Rank 0] step=806, skipped=0, lr=[1e-05], mom=[0.0] +steps: 806 loss: 0.0921 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:35:11,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=807, skipped=0, lr=[1e-05], mom=[0.0] +steps: 807 loss: 0.0329 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:35:22,357] [INFO] [logging.py:107:log_dist] [Rank 0] step=808, skipped=0, lr=[1e-05], mom=[0.0] +steps: 808 loss: 0.0324 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:35:33,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=809, skipped=0, lr=[1e-05], mom=[0.0] +steps: 809 loss: 0.0319 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:35:43,697] [INFO] [logging.py:107:log_dist] [Rank 0] step=810, skipped=0, lr=[1e-05], mom=[0.0] +steps: 810 loss: 0.1157 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:35:54,539] [INFO] [logging.py:107:log_dist] [Rank 0] step=811, skipped=0, lr=[1e-05], mom=[0.0] +steps: 811 loss: 0.1221 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 21:36:05,216] [INFO] [logging.py:107:log_dist] [Rank 0] step=812, skipped=0, lr=[1e-05], mom=[0.0] +steps: 812 loss: 0.0448 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:36:15,886] [INFO] [logging.py:107:log_dist] [Rank 0] step=813, skipped=0, lr=[1e-05], mom=[0.0] +steps: 813 loss: 0.0336 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:36:26,557] [INFO] [logging.py:107:log_dist] [Rank 0] step=814, skipped=0, lr=[1e-05], mom=[0.0] +steps: 814 loss: 0.0537 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:36:37,233] [INFO] [logging.py:107:log_dist] [Rank 0] step=815, skipped=0, lr=[1e-05], mom=[0.0] +steps: 815 loss: 0.0609 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:36:47,900] [INFO] [logging.py:107:log_dist] [Rank 0] step=816, skipped=0, lr=[1e-05], mom=[0.0] +steps: 816 loss: 0.0559 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:36:58,568] [INFO] [logging.py:107:log_dist] [Rank 0] step=817, skipped=0, lr=[1e-05], mom=[0.0] +steps: 817 loss: 0.0557 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:37:09,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=818, skipped=0, lr=[1e-05], mom=[0.0] +steps: 818 loss: 0.1346 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:37:20,095] [INFO] [logging.py:107:log_dist] [Rank 0] step=819, skipped=0, lr=[1e-05], mom=[0.0] +steps: 819 loss: 0.0593 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-05 21:37:30,762] [INFO] [logging.py:107:log_dist] [Rank 0] step=820, skipped=0, lr=[1e-05], mom=[0.0] +steps: 820 loss: 0.0712 iter time (s): 10.641 samples/sec: 0.094 +Saving model to directory epoch20 +Started new epoch: 21 +[2025-05-05 21:37:43,459] [INFO] [logging.py:107:log_dist] [Rank 0] step=821, skipped=0, lr=[1e-05], mom=[0.0] +steps: 821 loss: 0.1488 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:37:54,125] [INFO] [logging.py:107:log_dist] [Rank 0] step=822, skipped=0, lr=[1e-05], mom=[0.0] +steps: 822 loss: 0.0881 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 21:38:04,793] [INFO] [logging.py:107:log_dist] [Rank 0] step=823, skipped=0, lr=[1e-05], mom=[0.0] +steps: 823 loss: 0.0805 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:38:15,465] [INFO] [logging.py:107:log_dist] [Rank 0] step=824, skipped=0, lr=[1e-05], mom=[0.0] +steps: 824 loss: 0.0504 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:38:26,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=825, skipped=0, lr=[1e-05], mom=[0.0] +steps: 825 loss: 0.0442 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:38:36,806] [INFO] [logging.py:107:log_dist] [Rank 0] step=826, skipped=0, lr=[1e-05], mom=[0.0] +steps: 826 loss: 0.0553 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:38:47,697] [INFO] [logging.py:107:log_dist] [Rank 0] step=827, skipped=0, lr=[1e-05], mom=[0.0] +steps: 827 loss: 0.0478 iter time (s): 10.860 samples/sec: 0.092 +[2025-05-05 21:38:58,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=828, skipped=0, lr=[1e-05], mom=[0.0] +steps: 828 loss: 0.0659 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:39:09,036] [INFO] [logging.py:107:log_dist] [Rank 0] step=829, skipped=0, lr=[1e-05], mom=[0.0] +steps: 829 loss: 0.2097 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:39:19,708] [INFO] [logging.py:107:log_dist] [Rank 0] step=830, skipped=0, lr=[1e-05], mom=[0.0] +steps: 830 loss: 0.1000 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:39:30,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=831, skipped=0, lr=[1e-05], mom=[0.0] +steps: 831 loss: 0.1240 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:39:41,058] [INFO] [logging.py:107:log_dist] [Rank 0] step=832, skipped=0, lr=[1e-05], mom=[0.0] +steps: 832 loss: 0.0802 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:39:51,726] [INFO] [logging.py:107:log_dist] [Rank 0] step=833, skipped=0, lr=[1e-05], mom=[0.0] +steps: 833 loss: 0.0658 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:40:02,403] [INFO] [logging.py:107:log_dist] [Rank 0] step=834, skipped=0, lr=[1e-05], mom=[0.0] +steps: 834 loss: 0.1563 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:40:13,081] [INFO] [logging.py:107:log_dist] [Rank 0] step=835, skipped=0, lr=[1e-05], mom=[0.0] +steps: 835 loss: 0.2638 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:40:23,920] [INFO] [logging.py:107:log_dist] [Rank 0] step=836, skipped=0, lr=[1e-05], mom=[0.0] +steps: 836 loss: 0.0610 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-05 21:40:34,592] [INFO] [logging.py:107:log_dist] [Rank 0] step=837, skipped=0, lr=[1e-05], mom=[0.0] +steps: 837 loss: 0.0976 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:40:45,274] [INFO] [logging.py:107:log_dist] [Rank 0] step=838, skipped=0, lr=[1e-05], mom=[0.0] +steps: 838 loss: 0.0750 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 21:40:55,947] [INFO] [logging.py:107:log_dist] [Rank 0] step=839, skipped=0, lr=[1e-05], mom=[0.0] +steps: 839 loss: 0.0921 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:41:06,618] [INFO] [logging.py:107:log_dist] [Rank 0] step=840, skipped=0, lr=[1e-05], mom=[0.0] +steps: 840 loss: 0.1227 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:41:17,293] [INFO] [logging.py:107:log_dist] [Rank 0] step=841, skipped=0, lr=[1e-05], mom=[0.0] +steps: 841 loss: 0.0264 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:41:27,964] [INFO] [logging.py:107:log_dist] [Rank 0] step=842, skipped=0, lr=[1e-05], mom=[0.0] +steps: 842 loss: 0.0603 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:41:38,633] [INFO] [logging.py:107:log_dist] [Rank 0] step=843, skipped=0, lr=[1e-05], mom=[0.0] +steps: 843 loss: 0.1607 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:41:49,307] [INFO] [logging.py:107:log_dist] [Rank 0] step=844, skipped=0, lr=[1e-05], mom=[0.0] +steps: 844 loss: 0.0926 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:42:00,144] [INFO] [logging.py:107:log_dist] [Rank 0] step=845, skipped=0, lr=[1e-05], mom=[0.0] +steps: 845 loss: 0.1171 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 21:42:10,814] [INFO] [logging.py:107:log_dist] [Rank 0] step=846, skipped=0, lr=[1e-05], mom=[0.0] +steps: 846 loss: 0.0373 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:42:21,492] [INFO] [logging.py:107:log_dist] [Rank 0] step=847, skipped=0, lr=[1e-05], mom=[0.0] +steps: 847 loss: 0.2016 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:42:32,163] [INFO] [logging.py:107:log_dist] [Rank 0] step=848, skipped=0, lr=[1e-05], mom=[0.0] +steps: 848 loss: 0.0500 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:42:42,833] [INFO] [logging.py:107:log_dist] [Rank 0] step=849, skipped=0, lr=[1e-05], mom=[0.0] +steps: 849 loss: 0.0833 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:42:53,512] [INFO] [logging.py:107:log_dist] [Rank 0] step=850, skipped=0, lr=[1e-05], mom=[0.0] +steps: 850 loss: 0.0662 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 21:43:04,193] [INFO] [logging.py:107:log_dist] [Rank 0] step=851, skipped=0, lr=[1e-05], mom=[0.0] +steps: 851 loss: 0.1074 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 21:43:14,863] [INFO] [logging.py:107:log_dist] [Rank 0] step=852, skipped=0, lr=[1e-05], mom=[0.0] +steps: 852 loss: 0.1130 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:43:25,538] [INFO] [logging.py:107:log_dist] [Rank 0] step=853, skipped=0, lr=[1e-05], mom=[0.0] +steps: 853 loss: 0.0504 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:43:36,368] [INFO] [logging.py:107:log_dist] [Rank 0] step=854, skipped=0, lr=[1e-05], mom=[0.0] +steps: 854 loss: 0.0367 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-05 21:43:47,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=855, skipped=0, lr=[1e-05], mom=[0.0] +steps: 855 loss: 0.0325 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:43:57,716] [INFO] [logging.py:107:log_dist] [Rank 0] step=856, skipped=0, lr=[1e-05], mom=[0.0] +steps: 856 loss: 0.1103 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:44:08,386] [INFO] [logging.py:107:log_dist] [Rank 0] step=857, skipped=0, lr=[1e-05], mom=[0.0] +steps: 857 loss: 0.2442 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:44:19,056] [INFO] [logging.py:107:log_dist] [Rank 0] step=858, skipped=0, lr=[1e-05], mom=[0.0] +steps: 858 loss: 0.0430 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:44:29,730] [INFO] [logging.py:107:log_dist] [Rank 0] step=859, skipped=0, lr=[1e-05], mom=[0.0] +steps: 859 loss: 0.0374 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:44:40,401] [INFO] [logging.py:107:log_dist] [Rank 0] step=860, skipped=0, lr=[1e-05], mom=[0.0] +steps: 860 loss: 0.0559 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:44:51,068] [INFO] [logging.py:107:log_dist] [Rank 0] step=861, skipped=0, lr=[1e-05], mom=[0.0] +steps: 861 loss: 0.0461 iter time (s): 10.640 samples/sec: 0.094 +Started new epoch: 22 +[2025-05-05 21:45:02,269] [INFO] [logging.py:107:log_dist] [Rank 0] step=862, skipped=0, lr=[1e-05], mom=[0.0] +steps: 862 loss: 0.0573 iter time (s): 10.838 samples/sec: 0.092 +[2025-05-05 21:45:12,941] [INFO] [logging.py:107:log_dist] [Rank 0] step=863, skipped=0, lr=[1e-05], mom=[0.0] +steps: 863 loss: 0.0980 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:45:23,621] [INFO] [logging.py:107:log_dist] [Rank 0] step=864, skipped=0, lr=[1e-05], mom=[0.0] +steps: 864 loss: 0.0523 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:45:34,291] [INFO] [logging.py:107:log_dist] [Rank 0] step=865, skipped=0, lr=[1e-05], mom=[0.0] +steps: 865 loss: 0.0395 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:45:44,962] [INFO] [logging.py:107:log_dist] [Rank 0] step=866, skipped=0, lr=[1e-05], mom=[0.0] +steps: 866 loss: 0.0740 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:45:55,629] [INFO] [logging.py:107:log_dist] [Rank 0] step=867, skipped=0, lr=[1e-05], mom=[0.0] +steps: 867 loss: 0.0345 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:46:06,302] [INFO] [logging.py:107:log_dist] [Rank 0] step=868, skipped=0, lr=[1e-05], mom=[0.0] +steps: 868 loss: 0.0370 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:46:16,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=869, skipped=0, lr=[1e-05], mom=[0.0] +steps: 869 loss: 0.0379 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:46:27,640] [INFO] [logging.py:107:log_dist] [Rank 0] step=870, skipped=0, lr=[1e-05], mom=[0.0] +steps: 870 loss: 0.1048 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:46:38,500] [INFO] [logging.py:107:log_dist] [Rank 0] step=871, skipped=0, lr=[1e-05], mom=[0.0] +steps: 871 loss: 0.1306 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-05 21:46:49,171] [INFO] [logging.py:107:log_dist] [Rank 0] step=872, skipped=0, lr=[1e-05], mom=[0.0] +steps: 872 loss: 0.1715 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:46:59,839] [INFO] [logging.py:107:log_dist] [Rank 0] step=873, skipped=0, lr=[1e-05], mom=[0.0] +steps: 873 loss: 0.0957 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:47:10,509] [INFO] [logging.py:107:log_dist] [Rank 0] step=874, skipped=0, lr=[1e-05], mom=[0.0] +steps: 874 loss: 0.0857 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:47:21,181] [INFO] [logging.py:107:log_dist] [Rank 0] step=875, skipped=0, lr=[1e-05], mom=[0.0] +steps: 875 loss: 0.0410 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:47:31,852] [INFO] [logging.py:107:log_dist] [Rank 0] step=876, skipped=0, lr=[1e-05], mom=[0.0] +steps: 876 loss: 0.0497 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:47:42,528] [INFO] [logging.py:107:log_dist] [Rank 0] step=877, skipped=0, lr=[1e-05], mom=[0.0] +steps: 877 loss: 0.0522 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:47:53,208] [INFO] [logging.py:107:log_dist] [Rank 0] step=878, skipped=0, lr=[1e-05], mom=[0.0] +steps: 878 loss: 0.0715 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:48:03,879] [INFO] [logging.py:107:log_dist] [Rank 0] step=879, skipped=0, lr=[1e-05], mom=[0.0] +steps: 879 loss: 0.0430 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:48:14,713] [INFO] [logging.py:107:log_dist] [Rank 0] step=880, skipped=0, lr=[1e-05], mom=[0.0] +steps: 880 loss: 0.1543 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 21:48:25,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=881, skipped=0, lr=[1e-05], mom=[0.0] +steps: 881 loss: 0.0880 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:48:36,053] [INFO] [logging.py:107:log_dist] [Rank 0] step=882, skipped=0, lr=[1e-05], mom=[0.0] +steps: 882 loss: 0.0324 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:48:46,734] [INFO] [logging.py:107:log_dist] [Rank 0] step=883, skipped=0, lr=[1e-05], mom=[0.0] +steps: 883 loss: 0.0827 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 21:48:57,410] [INFO] [logging.py:107:log_dist] [Rank 0] step=884, skipped=0, lr=[1e-05], mom=[0.0] +steps: 884 loss: 0.1004 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:49:08,084] [INFO] [logging.py:107:log_dist] [Rank 0] step=885, skipped=0, lr=[1e-05], mom=[0.0] +steps: 885 loss: 0.0398 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:49:18,774] [INFO] [logging.py:107:log_dist] [Rank 0] step=886, skipped=0, lr=[1e-05], mom=[0.0] +steps: 886 loss: 0.0461 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:49:29,444] [INFO] [logging.py:107:log_dist] [Rank 0] step=887, skipped=0, lr=[1e-05], mom=[0.0] +steps: 887 loss: 0.0620 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:49:40,320] [INFO] [logging.py:107:log_dist] [Rank 0] step=888, skipped=0, lr=[1e-05], mom=[0.0] +steps: 888 loss: 0.0351 iter time (s): 10.842 samples/sec: 0.092 +[2025-05-05 21:49:50,992] [INFO] [logging.py:107:log_dist] [Rank 0] step=889, skipped=0, lr=[1e-05], mom=[0.0] +steps: 889 loss: 0.0508 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:50:01,670] [INFO] [logging.py:107:log_dist] [Rank 0] step=890, skipped=0, lr=[1e-05], mom=[0.0] +steps: 890 loss: 0.0286 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 21:50:12,428] [INFO] [logging.py:107:log_dist] [Rank 0] step=891, skipped=0, lr=[1e-05], mom=[0.0] +steps: 891 loss: 0.3688 iter time (s): 10.727 samples/sec: 0.093 +[2025-05-05 21:50:23,165] [INFO] [logging.py:107:log_dist] [Rank 0] step=892, skipped=0, lr=[1e-05], mom=[0.0] +steps: 892 loss: 0.2502 iter time (s): 10.694 samples/sec: 0.094 +[2025-05-05 21:50:33,854] [INFO] [logging.py:107:log_dist] [Rank 0] step=893, skipped=0, lr=[1e-05], mom=[0.0] +steps: 893 loss: 0.0588 iter time (s): 10.658 samples/sec: 0.094 +[2025-05-05 21:50:44,529] [INFO] [logging.py:107:log_dist] [Rank 0] step=894, skipped=0, lr=[1e-05], mom=[0.0] +steps: 894 loss: 0.0878 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:50:55,212] [INFO] [logging.py:107:log_dist] [Rank 0] step=895, skipped=0, lr=[1e-05], mom=[0.0] +steps: 895 loss: 0.0504 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:51:05,886] [INFO] [logging.py:107:log_dist] [Rank 0] step=896, skipped=0, lr=[1e-05], mom=[0.0] +steps: 896 loss: 0.0913 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:51:16,752] [INFO] [logging.py:107:log_dist] [Rank 0] step=897, skipped=0, lr=[1e-05], mom=[0.0] +steps: 897 loss: 0.0355 iter time (s): 10.835 samples/sec: 0.092 +[2025-05-05 21:51:27,419] [INFO] [logging.py:107:log_dist] [Rank 0] step=898, skipped=0, lr=[1e-05], mom=[0.0] +steps: 898 loss: 0.0634 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 21:51:38,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=899, skipped=0, lr=[1e-05], mom=[0.0] +steps: 899 loss: 0.0533 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:51:48,763] [INFO] [logging.py:107:log_dist] [Rank 0] step=900, skipped=0, lr=[1e-05], mom=[0.0] +steps: 900 loss: 0.2317 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:51:59,434] [INFO] [logging.py:107:log_dist] [Rank 0] step=901, skipped=0, lr=[1e-05], mom=[0.0] +steps: 901 loss: 0.0601 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:52:10,099] [INFO] [logging.py:107:log_dist] [Rank 0] step=902, skipped=0, lr=[1e-05], mom=[0.0] +steps: 902 loss: 0.2346 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 23 +[2025-05-05 21:52:21,114] [INFO] [logging.py:107:log_dist] [Rank 0] step=903, skipped=0, lr=[1e-05], mom=[0.0] +steps: 903 loss: 0.1148 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:52:31,784] [INFO] [logging.py:107:log_dist] [Rank 0] step=904, skipped=0, lr=[1e-05], mom=[0.0] +steps: 904 loss: 0.3782 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:52:42,451] [INFO] [logging.py:107:log_dist] [Rank 0] step=905, skipped=0, lr=[1e-05], mom=[0.0] +steps: 905 loss: 0.1090 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:52:53,281] [INFO] [logging.py:107:log_dist] [Rank 0] step=906, skipped=0, lr=[1e-05], mom=[0.0] +steps: 906 loss: 0.0455 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 21:53:03,947] [INFO] [logging.py:107:log_dist] [Rank 0] step=907, skipped=0, lr=[1e-05], mom=[0.0] +steps: 907 loss: 0.0625 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-05 21:53:14,616] [INFO] [logging.py:107:log_dist] [Rank 0] step=908, skipped=0, lr=[1e-05], mom=[0.0] +steps: 908 loss: 0.0346 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:53:25,287] [INFO] [logging.py:107:log_dist] [Rank 0] step=909, skipped=0, lr=[1e-05], mom=[0.0] +steps: 909 loss: 0.0392 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:53:35,959] [INFO] [logging.py:107:log_dist] [Rank 0] step=910, skipped=0, lr=[1e-05], mom=[0.0] +steps: 910 loss: 0.0536 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:53:46,631] [INFO] [logging.py:107:log_dist] [Rank 0] step=911, skipped=0, lr=[1e-05], mom=[0.0] +steps: 911 loss: 0.0445 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:53:57,300] [INFO] [logging.py:107:log_dist] [Rank 0] step=912, skipped=0, lr=[1e-05], mom=[0.0] +steps: 912 loss: 0.0876 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 21:54:07,975] [INFO] [logging.py:107:log_dist] [Rank 0] step=913, skipped=0, lr=[1e-05], mom=[0.0] +steps: 913 loss: 0.0874 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:54:18,807] [INFO] [logging.py:107:log_dist] [Rank 0] step=914, skipped=0, lr=[1e-05], mom=[0.0] +steps: 914 loss: 0.0387 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 21:54:29,481] [INFO] [logging.py:107:log_dist] [Rank 0] step=915, skipped=0, lr=[1e-05], mom=[0.0] +steps: 915 loss: 0.0636 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:54:40,156] [INFO] [logging.py:107:log_dist] [Rank 0] step=916, skipped=0, lr=[1e-05], mom=[0.0] +steps: 916 loss: 0.0529 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 21:54:50,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=917, skipped=0, lr=[1e-05], mom=[0.0] +steps: 917 loss: 0.0234 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 21:55:01,497] [INFO] [logging.py:107:log_dist] [Rank 0] step=918, skipped=0, lr=[1e-05], mom=[0.0] +steps: 918 loss: 0.0495 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:55:12,171] [INFO] [logging.py:107:log_dist] [Rank 0] step=919, skipped=0, lr=[1e-05], mom=[0.0] +steps: 919 loss: 0.2407 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:55:22,839] [INFO] [logging.py:107:log_dist] [Rank 0] step=920, skipped=0, lr=[1e-05], mom=[0.0] +steps: 920 loss: 0.0810 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:55:33,560] [INFO] [logging.py:107:log_dist] [Rank 0] step=921, skipped=0, lr=[1e-05], mom=[0.0] +steps: 921 loss: 0.1564 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:55:44,227] [INFO] [logging.py:107:log_dist] [Rank 0] step=922, skipped=0, lr=[1e-05], mom=[0.0] +steps: 922 loss: 0.1327 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 21:55:55,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=923, skipped=0, lr=[1e-05], mom=[0.0] +steps: 923 loss: 0.0892 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 21:56:05,735] [INFO] [logging.py:107:log_dist] [Rank 0] step=924, skipped=0, lr=[1e-05], mom=[0.0] +steps: 924 loss: 0.1761 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 21:56:16,400] [INFO] [logging.py:107:log_dist] [Rank 0] step=925, skipped=0, lr=[1e-05], mom=[0.0] +steps: 925 loss: 0.1569 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 21:56:27,071] [INFO] [logging.py:107:log_dist] [Rank 0] step=926, skipped=0, lr=[1e-05], mom=[0.0] +steps: 926 loss: 0.0840 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:56:37,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=927, skipped=0, lr=[1e-05], mom=[0.0] +steps: 927 loss: 0.2116 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:56:48,416] [INFO] [logging.py:107:log_dist] [Rank 0] step=928, skipped=0, lr=[1e-05], mom=[0.0] +steps: 928 loss: 0.0341 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:56:59,085] [INFO] [logging.py:107:log_dist] [Rank 0] step=929, skipped=0, lr=[1e-05], mom=[0.0] +steps: 929 loss: 0.1125 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:57:09,760] [INFO] [logging.py:107:log_dist] [Rank 0] step=930, skipped=0, lr=[1e-05], mom=[0.0] +steps: 930 loss: 0.0325 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:57:20,619] [INFO] [logging.py:107:log_dist] [Rank 0] step=931, skipped=0, lr=[1e-05], mom=[0.0] +steps: 931 loss: 0.0378 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-05 21:57:31,299] [INFO] [logging.py:107:log_dist] [Rank 0] step=932, skipped=0, lr=[1e-05], mom=[0.0] +steps: 932 loss: 0.0660 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 21:57:41,978] [INFO] [logging.py:107:log_dist] [Rank 0] step=933, skipped=0, lr=[1e-05], mom=[0.0] +steps: 933 loss: 0.0736 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:57:52,650] [INFO] [logging.py:107:log_dist] [Rank 0] step=934, skipped=0, lr=[1e-05], mom=[0.0] +steps: 934 loss: 0.0637 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:58:03,319] [INFO] [logging.py:107:log_dist] [Rank 0] step=935, skipped=0, lr=[1e-05], mom=[0.0] +steps: 935 loss: 0.0361 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:58:13,995] [INFO] [logging.py:107:log_dist] [Rank 0] step=936, skipped=0, lr=[1e-05], mom=[0.0] +steps: 936 loss: 0.1929 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 21:58:24,664] [INFO] [logging.py:107:log_dist] [Rank 0] step=937, skipped=0, lr=[1e-05], mom=[0.0] +steps: 937 loss: 0.0278 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 21:58:35,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=938, skipped=0, lr=[1e-05], mom=[0.0] +steps: 938 loss: 0.0521 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 21:58:46,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=939, skipped=0, lr=[1e-05], mom=[0.0] +steps: 939 loss: 0.0942 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 21:58:56,864] [INFO] [logging.py:107:log_dist] [Rank 0] step=940, skipped=0, lr=[1e-05], mom=[0.0] +steps: 940 loss: 0.0784 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 21:59:07,541] [INFO] [logging.py:107:log_dist] [Rank 0] step=941, skipped=0, lr=[1e-05], mom=[0.0] +steps: 941 loss: 0.1436 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 21:59:18,215] [INFO] [logging.py:107:log_dist] [Rank 0] step=942, skipped=0, lr=[1e-05], mom=[0.0] +steps: 942 loss: 0.1317 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 21:59:28,876] [INFO] [logging.py:107:log_dist] [Rank 0] step=943, skipped=0, lr=[1e-05], mom=[0.0] +steps: 943 loss: 0.2352 iter time (s): 10.635 samples/sec: 0.094 +Started new epoch: 24 +[2025-05-05 21:59:39,896] [INFO] [logging.py:107:log_dist] [Rank 0] step=944, skipped=0, lr=[1e-05], mom=[0.0] +steps: 944 loss: 0.0659 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 21:59:50,566] [INFO] [logging.py:107:log_dist] [Rank 0] step=945, skipped=0, lr=[1e-05], mom=[0.0] +steps: 945 loss: 0.0809 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:00:01,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=946, skipped=0, lr=[1e-05], mom=[0.0] +steps: 946 loss: 0.2327 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:00:11,924] [INFO] [logging.py:107:log_dist] [Rank 0] step=947, skipped=0, lr=[1e-05], mom=[0.0] +steps: 947 loss: 0.0611 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 22:00:22,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=948, skipped=0, lr=[1e-05], mom=[0.0] +steps: 948 loss: 0.0329 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:00:33,431] [INFO] [logging.py:107:log_dist] [Rank 0] step=949, skipped=0, lr=[1e-05], mom=[0.0] +steps: 949 loss: 0.0512 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-05 22:00:44,102] [INFO] [logging.py:107:log_dist] [Rank 0] step=950, skipped=0, lr=[1e-05], mom=[0.0] +steps: 950 loss: 0.0795 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:00:54,777] [INFO] [logging.py:107:log_dist] [Rank 0] step=951, skipped=0, lr=[1e-05], mom=[0.0] +steps: 951 loss: 0.0460 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:01:05,446] [INFO] [logging.py:107:log_dist] [Rank 0] step=952, skipped=0, lr=[1e-05], mom=[0.0] +steps: 952 loss: 0.1330 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:01:16,119] [INFO] [logging.py:107:log_dist] [Rank 0] step=953, skipped=0, lr=[1e-05], mom=[0.0] +steps: 953 loss: 0.0713 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:01:26,794] [INFO] [logging.py:107:log_dist] [Rank 0] step=954, skipped=0, lr=[1e-05], mom=[0.0] +steps: 954 loss: 0.0738 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:01:37,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=955, skipped=0, lr=[1e-05], mom=[0.0] +steps: 955 loss: 0.0515 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:01:48,138] [INFO] [logging.py:107:log_dist] [Rank 0] step=956, skipped=0, lr=[1e-05], mom=[0.0] +steps: 956 loss: 0.0674 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:01:58,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=957, skipped=0, lr=[1e-05], mom=[0.0] +steps: 957 loss: 0.0453 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-05 22:02:09,666] [INFO] [logging.py:107:log_dist] [Rank 0] step=958, skipped=0, lr=[1e-05], mom=[0.0] +steps: 958 loss: 0.2219 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:02:20,337] [INFO] [logging.py:107:log_dist] [Rank 0] step=959, skipped=0, lr=[1e-05], mom=[0.0] +steps: 959 loss: 0.1263 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:02:31,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=960, skipped=0, lr=[1e-05], mom=[0.0] +steps: 960 loss: 0.0695 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:02:41,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=961, skipped=0, lr=[1e-05], mom=[0.0] +steps: 961 loss: 0.0791 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:02:52,348] [INFO] [logging.py:107:log_dist] [Rank 0] step=962, skipped=0, lr=[1e-05], mom=[0.0] +steps: 962 loss: 0.2603 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:03:03,016] [INFO] [logging.py:107:log_dist] [Rank 0] step=963, skipped=0, lr=[1e-05], mom=[0.0] +steps: 963 loss: 0.0601 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:03:13,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=964, skipped=0, lr=[1e-05], mom=[0.0] +steps: 964 loss: 0.0302 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:03:24,362] [INFO] [logging.py:107:log_dist] [Rank 0] step=965, skipped=0, lr=[1e-05], mom=[0.0] +steps: 965 loss: 0.0540 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:03:35,223] [INFO] [logging.py:107:log_dist] [Rank 0] step=966, skipped=0, lr=[1e-05], mom=[0.0] +steps: 966 loss: 0.0377 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-05 22:03:45,893] [INFO] [logging.py:107:log_dist] [Rank 0] step=967, skipped=0, lr=[1e-05], mom=[0.0] +steps: 967 loss: 0.0835 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:03:56,570] [INFO] [logging.py:107:log_dist] [Rank 0] step=968, skipped=0, lr=[1e-05], mom=[0.0] +steps: 968 loss: 0.0589 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:04:07,243] [INFO] [logging.py:107:log_dist] [Rank 0] step=969, skipped=0, lr=[1e-05], mom=[0.0] +steps: 969 loss: 0.1597 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:04:17,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=970, skipped=0, lr=[1e-05], mom=[0.0] +steps: 970 loss: 0.0555 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:04:28,591] [INFO] [logging.py:107:log_dist] [Rank 0] step=971, skipped=0, lr=[1e-05], mom=[0.0] +steps: 971 loss: 0.0358 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:04:39,259] [INFO] [logging.py:107:log_dist] [Rank 0] step=972, skipped=0, lr=[1e-05], mom=[0.0] +steps: 972 loss: 0.0468 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:04:49,928] [INFO] [logging.py:107:log_dist] [Rank 0] step=973, skipped=0, lr=[1e-05], mom=[0.0] +steps: 973 loss: 0.0858 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:05:00,601] [INFO] [logging.py:107:log_dist] [Rank 0] step=974, skipped=0, lr=[1e-05], mom=[0.0] +steps: 974 loss: 0.1344 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:05:11,430] [INFO] [logging.py:107:log_dist] [Rank 0] step=975, skipped=0, lr=[1e-05], mom=[0.0] +steps: 975 loss: 0.0524 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-05 22:05:22,101] [INFO] [logging.py:107:log_dist] [Rank 0] step=976, skipped=0, lr=[1e-05], mom=[0.0] +steps: 976 loss: 0.0633 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:05:32,778] [INFO] [logging.py:107:log_dist] [Rank 0] step=977, skipped=0, lr=[1e-05], mom=[0.0] +steps: 977 loss: 0.0881 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:05:43,450] [INFO] [logging.py:107:log_dist] [Rank 0] step=978, skipped=0, lr=[1e-05], mom=[0.0] +steps: 978 loss: 0.0367 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:05:54,126] [INFO] [logging.py:107:log_dist] [Rank 0] step=979, skipped=0, lr=[1e-05], mom=[0.0] +steps: 979 loss: 0.0935 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:06:04,795] [INFO] [logging.py:107:log_dist] [Rank 0] step=980, skipped=0, lr=[1e-05], mom=[0.0] +steps: 980 loss: 0.0985 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:06:15,464] [INFO] [logging.py:107:log_dist] [Rank 0] step=981, skipped=0, lr=[1e-05], mom=[0.0] +steps: 981 loss: 0.0321 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:06:26,132] [INFO] [logging.py:107:log_dist] [Rank 0] step=982, skipped=0, lr=[1e-05], mom=[0.0] +steps: 982 loss: 0.1496 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:06:36,967] [INFO] [logging.py:107:log_dist] [Rank 0] step=983, skipped=0, lr=[1e-05], mom=[0.0] +steps: 983 loss: 0.0922 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 22:06:47,631] [INFO] [logging.py:107:log_dist] [Rank 0] step=984, skipped=0, lr=[1e-05], mom=[0.0] +steps: 984 loss: 0.0559 iter time (s): 10.637 samples/sec: 0.094 +Started new epoch: 25 +[2025-05-05 22:06:58,649] [INFO] [logging.py:107:log_dist] [Rank 0] step=985, skipped=0, lr=[1e-05], mom=[0.0] +steps: 985 loss: 0.2344 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:07:09,327] [INFO] [logging.py:107:log_dist] [Rank 0] step=986, skipped=0, lr=[1e-05], mom=[0.0] +steps: 986 loss: 0.1231 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:07:19,997] [INFO] [logging.py:107:log_dist] [Rank 0] step=987, skipped=0, lr=[1e-05], mom=[0.0] +steps: 987 loss: 0.1320 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:07:30,665] [INFO] [logging.py:107:log_dist] [Rank 0] step=988, skipped=0, lr=[1e-05], mom=[0.0] +steps: 988 loss: 0.0894 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:07:41,338] [INFO] [logging.py:107:log_dist] [Rank 0] step=989, skipped=0, lr=[1e-05], mom=[0.0] +steps: 989 loss: 0.0606 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:07:52,008] [INFO] [logging.py:107:log_dist] [Rank 0] step=990, skipped=0, lr=[1e-05], mom=[0.0] +steps: 990 loss: 0.0542 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:08:02,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=991, skipped=0, lr=[1e-05], mom=[0.0] +steps: 991 loss: 0.0367 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:08:13,519] [INFO] [logging.py:107:log_dist] [Rank 0] step=992, skipped=0, lr=[1e-05], mom=[0.0] +steps: 992 loss: 0.0474 iter time (s): 10.812 samples/sec: 0.092 +[2025-05-05 22:08:24,190] [INFO] [logging.py:107:log_dist] [Rank 0] step=993, skipped=0, lr=[1e-05], mom=[0.0] +steps: 993 loss: 0.0364 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:08:34,856] [INFO] [logging.py:107:log_dist] [Rank 0] step=994, skipped=0, lr=[1e-05], mom=[0.0] +steps: 994 loss: 0.0698 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 22:08:45,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=995, skipped=0, lr=[1e-05], mom=[0.0] +steps: 995 loss: 0.0564 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:08:56,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=996, skipped=0, lr=[1e-05], mom=[0.0] +steps: 996 loss: 0.0864 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:09:06,871] [INFO] [logging.py:107:log_dist] [Rank 0] step=997, skipped=0, lr=[1e-05], mom=[0.0] +steps: 997 loss: 0.0676 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 22:09:17,551] [INFO] [logging.py:107:log_dist] [Rank 0] step=998, skipped=0, lr=[1e-05], mom=[0.0] +steps: 998 loss: 0.0402 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 22:09:28,223] [INFO] [logging.py:107:log_dist] [Rank 0] step=999, skipped=0, lr=[1e-05], mom=[0.0] +steps: 999 loss: 0.0813 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:09:39,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=1000, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1000 loss: 0.0888 iter time (s): 10.835 samples/sec: 0.092 +[2025-05-05 22:09:49,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=1001, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1001 loss: 0.0954 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:10:00,433] [INFO] [logging.py:107:log_dist] [Rank 0] step=1002, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1002 loss: 0.0828 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:10:11,111] [INFO] [logging.py:107:log_dist] [Rank 0] step=1003, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1003 loss: 0.0355 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:10:21,786] [INFO] [logging.py:107:log_dist] [Rank 0] step=1004, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1004 loss: 0.0921 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:10:32,454] [INFO] [logging.py:107:log_dist] [Rank 0] step=1005, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1005 loss: 0.0398 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:10:43,128] [INFO] [logging.py:107:log_dist] [Rank 0] step=1006, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1006 loss: 0.1739 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:10:53,802] [INFO] [logging.py:107:log_dist] [Rank 0] step=1007, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1007 loss: 0.0743 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:11:04,488] [INFO] [logging.py:107:log_dist] [Rank 0] step=1008, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1008 loss: 0.0312 iter time (s): 10.656 samples/sec: 0.094 +[2025-05-05 22:11:15,363] [INFO] [logging.py:107:log_dist] [Rank 0] step=1009, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1009 loss: 0.3217 iter time (s): 10.838 samples/sec: 0.092 +[2025-05-05 22:11:26,035] [INFO] [logging.py:107:log_dist] [Rank 0] step=1010, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1010 loss: 0.3664 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:11:36,711] [INFO] [logging.py:107:log_dist] [Rank 0] step=1011, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1011 loss: 0.0558 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 22:11:47,388] [INFO] [logging.py:107:log_dist] [Rank 0] step=1012, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1012 loss: 0.0530 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:11:58,076] [INFO] [logging.py:107:log_dist] [Rank 0] step=1013, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1013 loss: 0.1256 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 22:12:08,749] [INFO] [logging.py:107:log_dist] [Rank 0] step=1014, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1014 loss: 0.0673 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:12:19,419] [INFO] [logging.py:107:log_dist] [Rank 0] step=1015, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1015 loss: 0.0440 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:12:30,094] [INFO] [logging.py:107:log_dist] [Rank 0] step=1016, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1016 loss: 0.0561 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:12:40,766] [INFO] [logging.py:107:log_dist] [Rank 0] step=1017, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1017 loss: 0.0389 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:12:51,624] [INFO] [logging.py:107:log_dist] [Rank 0] step=1018, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1018 loss: 0.0463 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 22:13:02,295] [INFO] [logging.py:107:log_dist] [Rank 0] step=1019, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1019 loss: 0.0231 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:13:12,963] [INFO] [logging.py:107:log_dist] [Rank 0] step=1020, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1020 loss: 0.0990 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:13:23,635] [INFO] [logging.py:107:log_dist] [Rank 0] step=1021, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1021 loss: 0.0881 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:13:34,303] [INFO] [logging.py:107:log_dist] [Rank 0] step=1022, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1022 loss: 0.0575 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:13:44,973] [INFO] [logging.py:107:log_dist] [Rank 0] step=1023, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1023 loss: 0.0393 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:13:55,649] [INFO] [logging.py:107:log_dist] [Rank 0] step=1024, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1024 loss: 0.0315 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:14:06,320] [INFO] [logging.py:107:log_dist] [Rank 0] step=1025, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1025 loss: 0.1402 iter time (s): 10.644 samples/sec: 0.094 +Started new epoch: 26 +[2025-05-05 22:14:17,500] [INFO] [logging.py:107:log_dist] [Rank 0] step=1026, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1026 loss: 0.1293 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-05 22:14:28,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=1027, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1027 loss: 0.0419 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:14:38,847] [INFO] [logging.py:107:log_dist] [Rank 0] step=1028, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1028 loss: 0.0374 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:14:49,519] [INFO] [logging.py:107:log_dist] [Rank 0] step=1029, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1029 loss: 0.0917 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:15:00,197] [INFO] [logging.py:107:log_dist] [Rank 0] step=1030, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1030 loss: 0.1920 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:15:10,870] [INFO] [logging.py:107:log_dist] [Rank 0] step=1031, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1031 loss: 0.0529 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:15:21,543] [INFO] [logging.py:107:log_dist] [Rank 0] step=1032, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1032 loss: 0.0771 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:15:32,222] [INFO] [logging.py:107:log_dist] [Rank 0] step=1033, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1033 loss: 0.0494 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 22:15:42,890] [INFO] [logging.py:107:log_dist] [Rank 0] step=1034, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1034 loss: 0.3032 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:15:53,750] [INFO] [logging.py:107:log_dist] [Rank 0] step=1035, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1035 loss: 0.1188 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-05 22:16:04,421] [INFO] [logging.py:107:log_dist] [Rank 0] step=1036, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1036 loss: 0.0609 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:16:15,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=1037, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1037 loss: 0.0643 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:16:25,764] [INFO] [logging.py:107:log_dist] [Rank 0] step=1038, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1038 loss: 0.0383 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:16:36,441] [INFO] [logging.py:107:log_dist] [Rank 0] step=1039, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1039 loss: 0.0352 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:16:47,112] [INFO] [logging.py:107:log_dist] [Rank 0] step=1040, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1040 loss: 0.0309 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:16:57,784] [INFO] [logging.py:107:log_dist] [Rank 0] step=1041, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1041 loss: 0.0432 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:17:08,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=1042, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1042 loss: 0.0332 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:17:19,299] [INFO] [logging.py:107:log_dist] [Rank 0] step=1043, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1043 loss: 0.1531 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-05 22:17:29,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=1044, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1044 loss: 0.0276 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:17:40,641] [INFO] [logging.py:107:log_dist] [Rank 0] step=1045, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1045 loss: 0.0296 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:17:51,314] [INFO] [logging.py:107:log_dist] [Rank 0] step=1046, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1046 loss: 0.0717 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:18:01,985] [INFO] [logging.py:107:log_dist] [Rank 0] step=1047, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1047 loss: 0.0478 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:18:12,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=1048, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1048 loss: 0.2100 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:18:23,331] [INFO] [logging.py:107:log_dist] [Rank 0] step=1049, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1049 loss: 0.0416 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:18:34,000] [INFO] [logging.py:107:log_dist] [Rank 0] step=1050, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1050 loss: 0.0481 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:18:44,672] [INFO] [logging.py:107:log_dist] [Rank 0] step=1051, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1051 loss: 0.0406 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:18:55,511] [INFO] [logging.py:107:log_dist] [Rank 0] step=1052, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1052 loss: 0.0382 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-05 22:19:06,190] [INFO] [logging.py:107:log_dist] [Rank 0] step=1053, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1053 loss: 0.0278 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:19:16,859] [INFO] [logging.py:107:log_dist] [Rank 0] step=1054, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1054 loss: 0.0324 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:19:27,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=1055, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1055 loss: 0.0717 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:19:38,203] [INFO] [logging.py:107:log_dist] [Rank 0] step=1056, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1056 loss: 0.0535 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:19:48,875] [INFO] [logging.py:107:log_dist] [Rank 0] step=1057, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1057 loss: 0.0588 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:19:59,545] [INFO] [logging.py:107:log_dist] [Rank 0] step=1058, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1058 loss: 0.1650 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:20:10,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=1059, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1059 loss: 0.0610 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-05 22:20:20,903] [INFO] [logging.py:107:log_dist] [Rank 0] step=1060, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1060 loss: 0.0345 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:20:31,739] [INFO] [logging.py:107:log_dist] [Rank 0] step=1061, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1061 loss: 0.1652 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-05 22:20:42,420] [INFO] [logging.py:107:log_dist] [Rank 0] step=1062, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1062 loss: 0.0385 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 22:20:53,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=1063, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1063 loss: 0.0633 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:21:03,757] [INFO] [logging.py:107:log_dist] [Rank 0] step=1064, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1064 loss: 0.0784 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:21:14,434] [INFO] [logging.py:107:log_dist] [Rank 0] step=1065, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1065 loss: 0.0466 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:21:25,102] [INFO] [logging.py:107:log_dist] [Rank 0] step=1066, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1066 loss: 0.0551 iter time (s): 10.641 samples/sec: 0.094 +Started new epoch: 27 +[2025-05-05 22:21:36,110] [INFO] [logging.py:107:log_dist] [Rank 0] step=1067, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1067 loss: 0.0939 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:21:46,788] [INFO] [logging.py:107:log_dist] [Rank 0] step=1068, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1068 loss: 0.0516 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:21:57,650] [INFO] [logging.py:107:log_dist] [Rank 0] step=1069, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1069 loss: 0.1043 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-05 22:22:08,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=1070, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1070 loss: 0.0312 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:22:19,003] [INFO] [logging.py:107:log_dist] [Rank 0] step=1071, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1071 loss: 0.0642 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 22:22:29,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=1072, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1072 loss: 0.0674 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:22:40,347] [INFO] [logging.py:107:log_dist] [Rank 0] step=1073, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1073 loss: 0.0956 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:22:51,020] [INFO] [logging.py:107:log_dist] [Rank 0] step=1074, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1074 loss: 0.0669 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:23:01,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=1075, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1075 loss: 0.0636 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:23:12,359] [INFO] [logging.py:107:log_dist] [Rank 0] step=1076, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1076 loss: 0.0603 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:23:23,036] [INFO] [logging.py:107:log_dist] [Rank 0] step=1077, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1077 loss: 0.1335 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 22:23:33,904] [INFO] [logging.py:107:log_dist] [Rank 0] step=1078, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1078 loss: 0.0523 iter time (s): 10.837 samples/sec: 0.092 +[2025-05-05 22:23:44,576] [INFO] [logging.py:107:log_dist] [Rank 0] step=1079, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1079 loss: 0.1654 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:23:55,251] [INFO] [logging.py:107:log_dist] [Rank 0] step=1080, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1080 loss: 0.1515 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:24:05,922] [INFO] [logging.py:107:log_dist] [Rank 0] step=1081, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1081 loss: 0.0244 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:24:16,591] [INFO] [logging.py:107:log_dist] [Rank 0] step=1082, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1082 loss: 0.0489 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:24:27,264] [INFO] [logging.py:107:log_dist] [Rank 0] step=1083, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1083 loss: 0.0311 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:24:37,936] [INFO] [logging.py:107:log_dist] [Rank 0] step=1084, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1084 loss: 0.0465 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:24:48,607] [INFO] [logging.py:107:log_dist] [Rank 0] step=1085, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1085 loss: 0.0305 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:24:59,279] [INFO] [logging.py:107:log_dist] [Rank 0] step=1086, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1086 loss: 0.0628 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:25:10,116] [INFO] [logging.py:107:log_dist] [Rank 0] step=1087, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1087 loss: 0.0408 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-05 22:25:20,787] [INFO] [logging.py:107:log_dist] [Rank 0] step=1088, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1088 loss: 0.0386 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:25:31,462] [INFO] [logging.py:107:log_dist] [Rank 0] step=1089, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1089 loss: 0.1762 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:25:42,130] [INFO] [logging.py:107:log_dist] [Rank 0] step=1090, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1090 loss: 0.0512 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:25:52,804] [INFO] [logging.py:107:log_dist] [Rank 0] step=1091, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1091 loss: 0.2018 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:26:03,493] [INFO] [logging.py:107:log_dist] [Rank 0] step=1092, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1092 loss: 0.0769 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:26:14,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=1093, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1093 loss: 0.0841 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:26:24,832] [INFO] [logging.py:107:log_dist] [Rank 0] step=1094, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1094 loss: 0.2386 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:26:35,707] [INFO] [logging.py:107:log_dist] [Rank 0] step=1095, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1095 loss: 0.0483 iter time (s): 10.844 samples/sec: 0.092 +[2025-05-05 22:26:46,378] [INFO] [logging.py:107:log_dist] [Rank 0] step=1096, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1096 loss: 0.1425 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:26:57,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=1097, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1097 loss: 0.1025 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:27:07,725] [INFO] [logging.py:107:log_dist] [Rank 0] step=1098, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1098 loss: 0.0467 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:27:18,414] [INFO] [logging.py:107:log_dist] [Rank 0] step=1099, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1099 loss: 0.0787 iter time (s): 10.657 samples/sec: 0.094 +[2025-05-05 22:27:29,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=1100, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1100 loss: 0.0706 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 22:27:39,763] [INFO] [logging.py:107:log_dist] [Rank 0] step=1101, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1101 loss: 0.1392 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:27:50,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=1102, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1102 loss: 0.0673 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:28:01,108] [INFO] [logging.py:107:log_dist] [Rank 0] step=1103, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1103 loss: 0.0413 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:28:12,002] [INFO] [logging.py:107:log_dist] [Rank 0] step=1104, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1104 loss: 0.3314 iter time (s): 10.864 samples/sec: 0.092 +[2025-05-05 22:28:22,673] [INFO] [logging.py:107:log_dist] [Rank 0] step=1105, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1105 loss: 0.0504 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:28:33,342] [INFO] [logging.py:107:log_dist] [Rank 0] step=1106, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1106 loss: 0.0696 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:28:44,014] [INFO] [logging.py:107:log_dist] [Rank 0] step=1107, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1107 loss: 0.0441 iter time (s): 10.645 samples/sec: 0.094 +Started new epoch: 28 +[2025-05-05 22:28:55,022] [INFO] [logging.py:107:log_dist] [Rank 0] step=1108, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1108 loss: 0.0383 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:29:05,700] [INFO] [logging.py:107:log_dist] [Rank 0] step=1109, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1109 loss: 0.0485 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:29:16,379] [INFO] [logging.py:107:log_dist] [Rank 0] step=1110, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1110 loss: 0.0518 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:29:27,058] [INFO] [logging.py:107:log_dist] [Rank 0] step=1111, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1111 loss: 0.2175 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:29:37,732] [INFO] [logging.py:107:log_dist] [Rank 0] step=1112, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1112 loss: 0.0898 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:29:48,561] [INFO] [logging.py:107:log_dist] [Rank 0] step=1113, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1113 loss: 0.0735 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-05 22:29:59,231] [INFO] [logging.py:107:log_dist] [Rank 0] step=1114, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1114 loss: 0.2774 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:30:09,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=1115, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1115 loss: 0.1636 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-05 22:30:20,592] [INFO] [logging.py:107:log_dist] [Rank 0] step=1116, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1116 loss: 0.0791 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:30:31,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=1117, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1117 loss: 0.0349 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:30:41,935] [INFO] [logging.py:107:log_dist] [Rank 0] step=1118, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1118 loss: 0.0579 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:30:52,610] [INFO] [logging.py:107:log_dist] [Rank 0] step=1119, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1119 loss: 0.0561 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:31:03,282] [INFO] [logging.py:107:log_dist] [Rank 0] step=1120, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1120 loss: 0.0327 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:31:14,119] [INFO] [logging.py:107:log_dist] [Rank 0] step=1121, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1121 loss: 0.1064 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 22:31:24,792] [INFO] [logging.py:107:log_dist] [Rank 0] step=1122, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1122 loss: 0.0442 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:31:35,462] [INFO] [logging.py:107:log_dist] [Rank 0] step=1123, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1123 loss: 0.0894 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:31:46,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=1124, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1124 loss: 0.1087 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:31:56,807] [INFO] [logging.py:107:log_dist] [Rank 0] step=1125, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1125 loss: 0.0810 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:32:07,477] [INFO] [logging.py:107:log_dist] [Rank 0] step=1126, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1126 loss: 0.0632 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:32:18,155] [INFO] [logging.py:107:log_dist] [Rank 0] step=1127, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1127 loss: 0.1475 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:32:28,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=1128, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1128 loss: 0.1151 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:32:39,494] [INFO] [logging.py:107:log_dist] [Rank 0] step=1129, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1129 loss: 0.0487 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:32:50,330] [INFO] [logging.py:107:log_dist] [Rank 0] step=1130, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1130 loss: 0.0499 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-05 22:33:01,000] [INFO] [logging.py:107:log_dist] [Rank 0] step=1131, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1131 loss: 0.0788 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:33:11,670] [INFO] [logging.py:107:log_dist] [Rank 0] step=1132, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1132 loss: 0.1099 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:33:22,343] [INFO] [logging.py:107:log_dist] [Rank 0] step=1133, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1133 loss: 0.0386 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:33:33,017] [INFO] [logging.py:107:log_dist] [Rank 0] step=1134, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1134 loss: 0.0744 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:33:43,689] [INFO] [logging.py:107:log_dist] [Rank 0] step=1135, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1135 loss: 0.0647 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:33:54,363] [INFO] [logging.py:107:log_dist] [Rank 0] step=1136, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1136 loss: 0.0667 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:34:05,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=1137, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1137 loss: 0.0415 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 22:34:15,899] [INFO] [logging.py:107:log_dist] [Rank 0] step=1138, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1138 loss: 0.0762 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-05 22:34:26,572] [INFO] [logging.py:107:log_dist] [Rank 0] step=1139, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1139 loss: 0.0511 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:34:37,259] [INFO] [logging.py:107:log_dist] [Rank 0] step=1140, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1140 loss: 0.1823 iter time (s): 10.656 samples/sec: 0.094 +[2025-05-05 22:34:47,929] [INFO] [logging.py:107:log_dist] [Rank 0] step=1141, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1141 loss: 0.0929 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:34:58,607] [INFO] [logging.py:107:log_dist] [Rank 0] step=1142, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1142 loss: 0.0834 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 22:35:09,283] [INFO] [logging.py:107:log_dist] [Rank 0] step=1143, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1143 loss: 0.0535 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:35:19,949] [INFO] [logging.py:107:log_dist] [Rank 0] step=1144, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1144 loss: 0.3154 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 22:35:30,630] [INFO] [logging.py:107:log_dist] [Rank 0] step=1145, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1145 loss: 0.1953 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 22:35:41,300] [INFO] [logging.py:107:log_dist] [Rank 0] step=1146, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1146 loss: 0.1048 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:35:52,155] [INFO] [logging.py:107:log_dist] [Rank 0] step=1147, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1147 loss: 0.0436 iter time (s): 10.825 samples/sec: 0.092 +[2025-05-05 22:36:02,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=1148, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1148 loss: 0.5029 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 29 +[2025-05-05 22:36:13,837] [INFO] [logging.py:107:log_dist] [Rank 0] step=1149, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1149 loss: 0.0277 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:36:24,503] [INFO] [logging.py:107:log_dist] [Rank 0] step=1150, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1150 loss: 0.0957 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 22:36:35,177] [INFO] [logging.py:107:log_dist] [Rank 0] step=1151, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1151 loss: 0.3905 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:36:45,849] [INFO] [logging.py:107:log_dist] [Rank 0] step=1152, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1152 loss: 0.0383 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:36:56,524] [INFO] [logging.py:107:log_dist] [Rank 0] step=1153, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1153 loss: 0.2104 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:37:07,197] [INFO] [logging.py:107:log_dist] [Rank 0] step=1154, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1154 loss: 0.0370 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:37:17,869] [INFO] [logging.py:107:log_dist] [Rank 0] step=1155, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1155 loss: 0.0462 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:37:28,701] [INFO] [logging.py:107:log_dist] [Rank 0] step=1156, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1156 loss: 0.0923 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 22:37:39,380] [INFO] [logging.py:107:log_dist] [Rank 0] step=1157, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1157 loss: 0.0732 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:37:50,048] [INFO] [logging.py:107:log_dist] [Rank 0] step=1158, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1158 loss: 0.0291 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:38:00,722] [INFO] [logging.py:107:log_dist] [Rank 0] step=1159, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1159 loss: 0.1421 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:38:11,398] [INFO] [logging.py:107:log_dist] [Rank 0] step=1160, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1160 loss: 0.0710 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:38:22,070] [INFO] [logging.py:107:log_dist] [Rank 0] step=1161, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1161 loss: 0.0913 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:38:32,738] [INFO] [logging.py:107:log_dist] [Rank 0] step=1162, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1162 loss: 0.0280 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:38:43,412] [INFO] [logging.py:107:log_dist] [Rank 0] step=1163, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1163 loss: 0.0262 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:38:54,272] [INFO] [logging.py:107:log_dist] [Rank 0] step=1164, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1164 loss: 0.0422 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-05 22:39:04,946] [INFO] [logging.py:107:log_dist] [Rank 0] step=1165, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1165 loss: 0.0440 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:39:15,620] [INFO] [logging.py:107:log_dist] [Rank 0] step=1166, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1166 loss: 0.0453 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:39:26,293] [INFO] [logging.py:107:log_dist] [Rank 0] step=1167, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1167 loss: 0.0339 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:39:36,966] [INFO] [logging.py:107:log_dist] [Rank 0] step=1168, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1168 loss: 0.0703 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:39:47,635] [INFO] [logging.py:107:log_dist] [Rank 0] step=1169, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1169 loss: 0.0933 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:39:58,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=1170, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1170 loss: 0.0350 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 22:40:08,978] [INFO] [logging.py:107:log_dist] [Rank 0] step=1171, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1171 loss: 0.0419 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:40:19,652] [INFO] [logging.py:107:log_dist] [Rank 0] step=1172, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1172 loss: 0.3009 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:40:30,510] [INFO] [logging.py:107:log_dist] [Rank 0] step=1173, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1173 loss: 0.0675 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 22:40:41,185] [INFO] [logging.py:107:log_dist] [Rank 0] step=1174, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1174 loss: 0.2564 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:40:51,855] [INFO] [logging.py:107:log_dist] [Rank 0] step=1175, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1175 loss: 0.1126 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:41:02,539] [INFO] [logging.py:107:log_dist] [Rank 0] step=1176, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1176 loss: 0.0432 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 22:41:13,214] [INFO] [logging.py:107:log_dist] [Rank 0] step=1177, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1177 loss: 0.0399 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:41:23,888] [INFO] [logging.py:107:log_dist] [Rank 0] step=1178, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1178 loss: 0.1184 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:41:34,558] [INFO] [logging.py:107:log_dist] [Rank 0] step=1179, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1179 loss: 0.0345 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:41:45,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=1180, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1180 loss: 0.1018 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:41:55,904] [INFO] [logging.py:107:log_dist] [Rank 0] step=1181, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1181 loss: 0.0674 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:42:06,736] [INFO] [logging.py:107:log_dist] [Rank 0] step=1182, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1182 loss: 0.1323 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 22:42:17,407] [INFO] [logging.py:107:log_dist] [Rank 0] step=1183, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1183 loss: 0.0370 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:42:28,080] [INFO] [logging.py:107:log_dist] [Rank 0] step=1184, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1184 loss: 0.0321 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:42:38,750] [INFO] [logging.py:107:log_dist] [Rank 0] step=1185, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1185 loss: 0.0981 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:42:49,421] [INFO] [logging.py:107:log_dist] [Rank 0] step=1186, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1186 loss: 0.2362 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:43:00,091] [INFO] [logging.py:107:log_dist] [Rank 0] step=1187, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1187 loss: 0.1173 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:43:10,758] [INFO] [logging.py:107:log_dist] [Rank 0] step=1188, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1188 loss: 0.0443 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:43:21,432] [INFO] [logging.py:107:log_dist] [Rank 0] step=1189, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1189 loss: 0.1579 iter time (s): 10.647 samples/sec: 0.094 +Started new epoch: 30 +[2025-05-05 22:43:32,611] [INFO] [logging.py:107:log_dist] [Rank 0] step=1190, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1190 loss: 0.0387 iter time (s): 10.809 samples/sec: 0.093 +[2025-05-05 22:43:43,281] [INFO] [logging.py:107:log_dist] [Rank 0] step=1191, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1191 loss: 0.1977 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:43:53,953] [INFO] [logging.py:107:log_dist] [Rank 0] step=1192, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1192 loss: 0.0662 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:44:04,628] [INFO] [logging.py:107:log_dist] [Rank 0] step=1193, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1193 loss: 0.0359 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:44:15,296] [INFO] [logging.py:107:log_dist] [Rank 0] step=1194, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1194 loss: 0.0932 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:44:25,968] [INFO] [logging.py:107:log_dist] [Rank 0] step=1195, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1195 loss: 0.0447 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:44:36,641] [INFO] [logging.py:107:log_dist] [Rank 0] step=1196, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1196 loss: 0.1233 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:44:47,310] [INFO] [logging.py:107:log_dist] [Rank 0] step=1197, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1197 loss: 0.0568 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:44:57,988] [INFO] [logging.py:107:log_dist] [Rank 0] step=1198, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1198 loss: 0.0432 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:45:08,830] [INFO] [logging.py:107:log_dist] [Rank 0] step=1199, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1199 loss: 0.2255 iter time (s): 10.811 samples/sec: 0.092 +[2025-05-05 22:45:19,500] [INFO] [logging.py:107:log_dist] [Rank 0] step=1200, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1200 loss: 0.1139 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:45:30,177] [INFO] [logging.py:107:log_dist] [Rank 0] step=1201, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1201 loss: 0.0844 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:45:40,847] [INFO] [logging.py:107:log_dist] [Rank 0] step=1202, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1202 loss: 0.1178 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:45:51,518] [INFO] [logging.py:107:log_dist] [Rank 0] step=1203, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1203 loss: 0.1614 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:46:02,191] [INFO] [logging.py:107:log_dist] [Rank 0] step=1204, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1204 loss: 0.0579 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:46:12,861] [INFO] [logging.py:107:log_dist] [Rank 0] step=1205, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1205 loss: 0.0361 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:46:23,530] [INFO] [logging.py:107:log_dist] [Rank 0] step=1206, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1206 loss: 0.3801 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:46:34,396] [INFO] [logging.py:107:log_dist] [Rank 0] step=1207, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1207 loss: 0.0395 iter time (s): 10.835 samples/sec: 0.092 +[2025-05-05 22:46:45,071] [INFO] [logging.py:107:log_dist] [Rank 0] step=1208, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1208 loss: 0.0420 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:46:55,743] [INFO] [logging.py:107:log_dist] [Rank 0] step=1209, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1209 loss: 0.1013 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:47:06,420] [INFO] [logging.py:107:log_dist] [Rank 0] step=1210, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1210 loss: 0.0623 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 22:47:17,093] [INFO] [logging.py:107:log_dist] [Rank 0] step=1211, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1211 loss: 0.0555 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:47:27,763] [INFO] [logging.py:107:log_dist] [Rank 0] step=1212, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1212 loss: 0.0743 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:47:38,445] [INFO] [logging.py:107:log_dist] [Rank 0] step=1213, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1213 loss: 0.1271 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 22:47:49,115] [INFO] [logging.py:107:log_dist] [Rank 0] step=1214, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1214 loss: 0.0338 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:47:59,796] [INFO] [logging.py:107:log_dist] [Rank 0] step=1215, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1215 loss: 0.1571 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 22:48:10,666] [INFO] [logging.py:107:log_dist] [Rank 0] step=1216, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1216 loss: 0.1150 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-05 22:48:21,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=1217, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1217 loss: 0.0403 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:48:32,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=1218, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1218 loss: 0.0395 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:48:42,679] [INFO] [logging.py:107:log_dist] [Rank 0] step=1219, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1219 loss: 0.1577 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:48:53,349] [INFO] [logging.py:107:log_dist] [Rank 0] step=1220, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1220 loss: 0.0435 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:49:04,025] [INFO] [logging.py:107:log_dist] [Rank 0] step=1221, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1221 loss: 0.0404 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:49:14,708] [INFO] [logging.py:107:log_dist] [Rank 0] step=1222, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1222 loss: 0.0928 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 22:49:25,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=1223, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1223 loss: 0.1063 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:49:36,058] [INFO] [logging.py:107:log_dist] [Rank 0] step=1224, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1224 loss: 0.0272 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:49:46,892] [INFO] [logging.py:107:log_dist] [Rank 0] step=1225, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1225 loss: 0.0700 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-05 22:49:57,561] [INFO] [logging.py:107:log_dist] [Rank 0] step=1226, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1226 loss: 0.2836 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:50:08,239] [INFO] [logging.py:107:log_dist] [Rank 0] step=1227, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1227 loss: 0.0277 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:50:18,910] [INFO] [logging.py:107:log_dist] [Rank 0] step=1228, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1228 loss: 0.1486 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:50:29,582] [INFO] [logging.py:107:log_dist] [Rank 0] step=1229, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1229 loss: 0.0340 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:50:40,253] [INFO] [logging.py:107:log_dist] [Rank 0] step=1230, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1230 loss: 0.2356 iter time (s): 10.644 samples/sec: 0.094 +Saving model to directory epoch30 +Started new epoch: 31 +[2025-05-05 22:50:52,932] [INFO] [logging.py:107:log_dist] [Rank 0] step=1231, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1231 loss: 0.0796 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:51:03,601] [INFO] [logging.py:107:log_dist] [Rank 0] step=1232, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1232 loss: 0.0374 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:51:14,439] [INFO] [logging.py:107:log_dist] [Rank 0] step=1233, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1233 loss: 0.0419 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-05 22:51:25,107] [INFO] [logging.py:107:log_dist] [Rank 0] step=1234, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1234 loss: 0.0314 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:51:35,777] [INFO] [logging.py:107:log_dist] [Rank 0] step=1235, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1235 loss: 0.1058 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:51:46,453] [INFO] [logging.py:107:log_dist] [Rank 0] step=1236, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1236 loss: 0.0695 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:51:57,123] [INFO] [logging.py:107:log_dist] [Rank 0] step=1237, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1237 loss: 0.1129 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:52:07,795] [INFO] [logging.py:107:log_dist] [Rank 0] step=1238, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1238 loss: 0.1207 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:52:18,468] [INFO] [logging.py:107:log_dist] [Rank 0] step=1239, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1239 loss: 0.0954 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:52:29,141] [INFO] [logging.py:107:log_dist] [Rank 0] step=1240, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1240 loss: 0.1390 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:52:39,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=1241, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1241 loss: 0.0428 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 22:52:50,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=1242, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1242 loss: 0.0838 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 22:53:01,346] [INFO] [logging.py:107:log_dist] [Rank 0] step=1243, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1243 loss: 0.0685 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:53:12,016] [INFO] [logging.py:107:log_dist] [Rank 0] step=1244, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1244 loss: 0.0299 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 22:53:22,699] [INFO] [logging.py:107:log_dist] [Rank 0] step=1245, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1245 loss: 0.1325 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:53:33,368] [INFO] [logging.py:107:log_dist] [Rank 0] step=1246, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1246 loss: 0.0894 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:53:44,040] [INFO] [logging.py:107:log_dist] [Rank 0] step=1247, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1247 loss: 0.0348 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:53:54,712] [INFO] [logging.py:107:log_dist] [Rank 0] step=1248, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1248 loss: 0.1230 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:54:05,385] [INFO] [logging.py:107:log_dist] [Rank 0] step=1249, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1249 loss: 0.0450 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:54:16,209] [INFO] [logging.py:107:log_dist] [Rank 0] step=1250, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1250 loss: 0.0583 iter time (s): 10.793 samples/sec: 0.093 +[2025-05-05 22:54:26,891] [INFO] [logging.py:107:log_dist] [Rank 0] step=1251, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1251 loss: 0.0457 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 22:54:37,559] [INFO] [logging.py:107:log_dist] [Rank 0] step=1252, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1252 loss: 0.0346 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:54:48,226] [INFO] [logging.py:107:log_dist] [Rank 0] step=1253, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1253 loss: 0.1037 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:54:58,901] [INFO] [logging.py:107:log_dist] [Rank 0] step=1254, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1254 loss: 0.0354 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:55:09,576] [INFO] [logging.py:107:log_dist] [Rank 0] step=1255, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1255 loss: 0.0529 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:55:20,249] [INFO] [logging.py:107:log_dist] [Rank 0] step=1256, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1256 loss: 0.0549 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:55:30,921] [INFO] [logging.py:107:log_dist] [Rank 0] step=1257, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1257 loss: 0.0918 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:55:41,590] [INFO] [logging.py:107:log_dist] [Rank 0] step=1258, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1258 loss: 0.1918 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:55:52,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=1259, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1259 loss: 0.3747 iter time (s): 10.825 samples/sec: 0.092 +[2025-05-05 22:56:03,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=1260, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1260 loss: 0.0320 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 22:56:13,808] [INFO] [logging.py:107:log_dist] [Rank 0] step=1261, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1261 loss: 0.0471 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:56:24,483] [INFO] [logging.py:107:log_dist] [Rank 0] step=1262, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1262 loss: 0.0441 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 22:56:35,155] [INFO] [logging.py:107:log_dist] [Rank 0] step=1263, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1263 loss: 0.0338 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:56:45,835] [INFO] [logging.py:107:log_dist] [Rank 0] step=1264, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1264 loss: 0.0452 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:56:56,516] [INFO] [logging.py:107:log_dist] [Rank 0] step=1265, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1265 loss: 0.1297 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 22:57:07,190] [INFO] [logging.py:107:log_dist] [Rank 0] step=1266, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1266 loss: 0.0648 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:57:18,023] [INFO] [logging.py:107:log_dist] [Rank 0] step=1267, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1267 loss: 0.0564 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 22:57:28,694] [INFO] [logging.py:107:log_dist] [Rank 0] step=1268, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1268 loss: 0.0809 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 22:57:39,365] [INFO] [logging.py:107:log_dist] [Rank 0] step=1269, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1269 loss: 0.0860 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:57:50,031] [INFO] [logging.py:107:log_dist] [Rank 0] step=1270, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1270 loss: 0.0495 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 22:58:00,697] [INFO] [logging.py:107:log_dist] [Rank 0] step=1271, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1271 loss: 0.1274 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 32 +[2025-05-05 22:58:11,718] [INFO] [logging.py:107:log_dist] [Rank 0] step=1272, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1272 loss: 0.0614 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-05 22:58:22,390] [INFO] [logging.py:107:log_dist] [Rank 0] step=1273, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1273 loss: 0.0419 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:58:33,063] [INFO] [logging.py:107:log_dist] [Rank 0] step=1274, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1274 loss: 0.0377 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 22:58:43,732] [INFO] [logging.py:107:log_dist] [Rank 0] step=1275, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1275 loss: 0.1677 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 22:58:54,567] [INFO] [logging.py:107:log_dist] [Rank 0] step=1276, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1276 loss: 0.1186 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-05 22:59:05,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=1277, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1277 loss: 0.0442 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:59:15,905] [INFO] [logging.py:107:log_dist] [Rank 0] step=1278, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1278 loss: 0.0345 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 22:59:26,574] [INFO] [logging.py:107:log_dist] [Rank 0] step=1279, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1279 loss: 0.0854 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 22:59:37,244] [INFO] [logging.py:107:log_dist] [Rank 0] step=1280, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1280 loss: 0.0397 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:59:47,915] [INFO] [logging.py:107:log_dist] [Rank 0] step=1281, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1281 loss: 0.0544 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 22:59:58,589] [INFO] [logging.py:107:log_dist] [Rank 0] step=1282, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1282 loss: 0.2395 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:00:09,271] [INFO] [logging.py:107:log_dist] [Rank 0] step=1283, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1283 loss: 0.0869 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:00:20,154] [INFO] [logging.py:107:log_dist] [Rank 0] step=1284, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1284 loss: 0.1336 iter time (s): 10.851 samples/sec: 0.092 +[2025-05-05 23:00:30,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=1285, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1285 loss: 0.0998 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:00:41,496] [INFO] [logging.py:107:log_dist] [Rank 0] step=1286, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1286 loss: 0.0256 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:00:52,166] [INFO] [logging.py:107:log_dist] [Rank 0] step=1287, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1287 loss: 0.0428 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:01:02,835] [INFO] [logging.py:107:log_dist] [Rank 0] step=1288, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1288 loss: 0.0841 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:01:13,511] [INFO] [logging.py:107:log_dist] [Rank 0] step=1289, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1289 loss: 0.0555 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:01:24,182] [INFO] [logging.py:107:log_dist] [Rank 0] step=1290, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1290 loss: 0.3284 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:01:34,853] [INFO] [logging.py:107:log_dist] [Rank 0] step=1291, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1291 loss: 0.0720 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:01:45,523] [INFO] [logging.py:107:log_dist] [Rank 0] step=1292, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1292 loss: 0.0505 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:01:56,358] [INFO] [logging.py:107:log_dist] [Rank 0] step=1293, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1293 loss: 0.1013 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-05 23:02:07,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=1294, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1294 loss: 0.0408 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:02:17,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=1295, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1295 loss: 0.0856 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:02:28,373] [INFO] [logging.py:107:log_dist] [Rank 0] step=1296, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1296 loss: 0.0316 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:02:39,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=1297, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1297 loss: 0.0928 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:02:49,715] [INFO] [logging.py:107:log_dist] [Rank 0] step=1298, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1298 loss: 0.0354 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:03:00,384] [INFO] [logging.py:107:log_dist] [Rank 0] step=1299, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1299 loss: 0.0631 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:03:11,053] [INFO] [logging.py:107:log_dist] [Rank 0] step=1300, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1300 loss: 0.0467 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:03:21,728] [INFO] [logging.py:107:log_dist] [Rank 0] step=1301, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1301 loss: 0.1409 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:03:32,565] [INFO] [logging.py:107:log_dist] [Rank 0] step=1302, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1302 loss: 0.0436 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 23:03:43,235] [INFO] [logging.py:107:log_dist] [Rank 0] step=1303, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1303 loss: 0.3592 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:03:53,905] [INFO] [logging.py:107:log_dist] [Rank 0] step=1304, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1304 loss: 0.0332 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:04:04,576] [INFO] [logging.py:107:log_dist] [Rank 0] step=1305, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1305 loss: 0.0528 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:04:15,245] [INFO] [logging.py:107:log_dist] [Rank 0] step=1306, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1306 loss: 0.1340 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:04:25,921] [INFO] [logging.py:107:log_dist] [Rank 0] step=1307, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1307 loss: 0.0373 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:04:36,590] [INFO] [logging.py:107:log_dist] [Rank 0] step=1308, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1308 loss: 0.0584 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:04:47,257] [INFO] [logging.py:107:log_dist] [Rank 0] step=1309, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1309 loss: 0.1050 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 23:04:57,930] [INFO] [logging.py:107:log_dist] [Rank 0] step=1310, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1310 loss: 0.0584 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:05:08,766] [INFO] [logging.py:107:log_dist] [Rank 0] step=1311, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1311 loss: 0.1038 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-05 23:05:19,430] [INFO] [logging.py:107:log_dist] [Rank 0] step=1312, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1312 loss: 0.0494 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 33 +[2025-05-05 23:05:30,445] [INFO] [logging.py:107:log_dist] [Rank 0] step=1313, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1313 loss: 0.0688 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:05:41,114] [INFO] [logging.py:107:log_dist] [Rank 0] step=1314, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1314 loss: 0.0511 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:05:51,794] [INFO] [logging.py:107:log_dist] [Rank 0] step=1315, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1315 loss: 0.0946 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:06:02,464] [INFO] [logging.py:107:log_dist] [Rank 0] step=1316, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1316 loss: 0.0361 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:06:13,130] [INFO] [logging.py:107:log_dist] [Rank 0] step=1317, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1317 loss: 0.0739 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-05 23:06:23,805] [INFO] [logging.py:107:log_dist] [Rank 0] step=1318, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1318 loss: 0.0374 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:06:34,662] [INFO] [logging.py:107:log_dist] [Rank 0] step=1319, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1319 loss: 0.0842 iter time (s): 10.825 samples/sec: 0.092 +[2025-05-05 23:06:45,329] [INFO] [logging.py:107:log_dist] [Rank 0] step=1320, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1320 loss: 0.0484 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:06:56,005] [INFO] [logging.py:107:log_dist] [Rank 0] step=1321, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1321 loss: 0.1460 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:07:06,678] [INFO] [logging.py:107:log_dist] [Rank 0] step=1322, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1322 loss: 0.1090 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:07:17,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=1323, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1323 loss: 0.0778 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:07:28,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=1324, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1324 loss: 0.2632 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:07:38,698] [INFO] [logging.py:107:log_dist] [Rank 0] step=1325, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1325 loss: 0.0543 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:07:49,367] [INFO] [logging.py:107:log_dist] [Rank 0] step=1326, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1326 loss: 0.0867 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:08:00,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=1327, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1327 loss: 0.0624 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:08:10,903] [INFO] [logging.py:107:log_dist] [Rank 0] step=1328, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1328 loss: 0.3882 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-05 23:08:21,575] [INFO] [logging.py:107:log_dist] [Rank 0] step=1329, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1329 loss: 0.1038 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:08:32,248] [INFO] [logging.py:107:log_dist] [Rank 0] step=1330, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1330 loss: 0.1167 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:08:42,922] [INFO] [logging.py:107:log_dist] [Rank 0] step=1331, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1331 loss: 0.0844 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:08:53,592] [INFO] [logging.py:107:log_dist] [Rank 0] step=1332, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1332 loss: 0.0572 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:09:04,270] [INFO] [logging.py:107:log_dist] [Rank 0] step=1333, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1333 loss: 0.0750 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:09:14,943] [INFO] [logging.py:107:log_dist] [Rank 0] step=1334, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1334 loss: 0.0534 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:09:25,613] [INFO] [logging.py:107:log_dist] [Rank 0] step=1335, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1335 loss: 0.0393 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:09:36,289] [INFO] [logging.py:107:log_dist] [Rank 0] step=1336, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1336 loss: 0.0819 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:09:47,125] [INFO] [logging.py:107:log_dist] [Rank 0] step=1337, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1337 loss: 0.0724 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-05 23:09:57,797] [INFO] [logging.py:107:log_dist] [Rank 0] step=1338, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1338 loss: 0.0331 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:10:08,479] [INFO] [logging.py:107:log_dist] [Rank 0] step=1339, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1339 loss: 0.0286 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:10:19,150] [INFO] [logging.py:107:log_dist] [Rank 0] step=1340, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1340 loss: 0.0740 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:10:29,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=1341, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1341 loss: 0.0512 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:10:40,498] [INFO] [logging.py:107:log_dist] [Rank 0] step=1342, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1342 loss: 0.0621 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:10:51,177] [INFO] [logging.py:107:log_dist] [Rank 0] step=1343, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1343 loss: 0.0520 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 23:11:01,848] [INFO] [logging.py:107:log_dist] [Rank 0] step=1344, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1344 loss: 0.1496 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:11:12,711] [INFO] [logging.py:107:log_dist] [Rank 0] step=1345, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1345 loss: 0.0329 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-05 23:11:23,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=1346, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1346 loss: 0.0419 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:11:34,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=1347, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1347 loss: 0.0675 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:11:34,062] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step1347 is about to be saved! +[2025-05-05 23:11:34,063] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_00-model_states.pt... +[2025-05-05 23:11:34,064] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_00-model_states.pt. +[2025-05-05 23:11:34,070] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_01-model_states.pt... +[2025-05-05 23:11:34,077] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_01-model_states.pt. +[2025-05-05 23:11:34,082] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_02-model_states.pt... +[2025-05-05 23:11:34,089] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_02-model_states.pt. +[2025-05-05 23:11:34,093] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_03-model_states.pt... +[2025-05-05 23:11:34,099] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_03-model_states.pt. +[2025-05-05 23:11:34,103] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_04-model_states.pt... +[2025-05-05 23:11:34,109] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_04-model_states.pt. +[2025-05-05 23:11:34,113] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_05-model_states.pt... +[2025-05-05 23:11:34,120] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_05-model_states.pt. +[2025-05-05 23:11:34,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_06-model_states.pt... +[2025-05-05 23:11:34,130] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_06-model_states.pt. +[2025-05-05 23:11:34,133] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_07-model_states.pt... +[2025-05-05 23:11:34,140] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_07-model_states.pt. +[2025-05-05 23:11:34,143] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_08-model_states.pt... +[2025-05-05 23:11:34,150] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_08-model_states.pt. +[2025-05-05 23:11:34,154] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_09-model_states.pt... +[2025-05-05 23:11:34,160] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_09-model_states.pt. +[2025-05-05 23:11:34,164] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_10-model_states.pt... +[2025-05-05 23:11:34,170] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_10-model_states.pt. +[2025-05-05 23:11:34,174] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_11-model_states.pt... +[2025-05-05 23:11:34,180] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_11-model_states.pt. +[2025-05-05 23:11:34,184] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_12-model_states.pt... +[2025-05-05 23:11:34,190] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_12-model_states.pt. +[2025-05-05 23:11:34,193] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_13-model_states.pt... +[2025-05-05 23:11:34,200] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_13-model_states.pt. +[2025-05-05 23:11:34,203] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_14-model_states.pt... +[2025-05-05 23:11:34,210] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_14-model_states.pt. +[2025-05-05 23:11:34,213] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_15-model_states.pt... +[2025-05-05 23:11:34,220] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_15-model_states.pt. +[2025-05-05 23:11:34,223] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_16-model_states.pt... +[2025-05-05 23:11:34,230] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_16-model_states.pt. +[2025-05-05 23:11:34,233] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_17-model_states.pt... +[2025-05-05 23:11:34,240] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_17-model_states.pt. +[2025-05-05 23:11:34,243] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_18-model_states.pt... +[2025-05-05 23:11:34,250] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_18-model_states.pt. +[2025-05-05 23:11:34,253] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_19-model_states.pt... +[2025-05-05 23:11:34,260] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_19-model_states.pt. +[2025-05-05 23:11:34,263] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_20-model_states.pt... +[2025-05-05 23:11:34,269] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_20-model_states.pt. +[2025-05-05 23:11:34,273] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_21-model_states.pt... +[2025-05-05 23:11:34,279] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_21-model_states.pt. +[2025-05-05 23:11:34,283] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_22-model_states.pt... +[2025-05-05 23:11:34,289] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_22-model_states.pt. +[2025-05-05 23:11:34,293] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_23-model_states.pt... +[2025-05-05 23:11:34,299] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_23-model_states.pt. +[2025-05-05 23:11:34,303] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_24-model_states.pt... +[2025-05-05 23:11:34,309] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_24-model_states.pt. +[2025-05-05 23:11:34,312] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_25-model_states.pt... +[2025-05-05 23:11:34,319] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_25-model_states.pt. +[2025-05-05 23:11:34,322] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_26-model_states.pt... +[2025-05-05 23:11:34,329] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_26-model_states.pt. +[2025-05-05 23:11:34,332] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_27-model_states.pt... +[2025-05-05 23:11:34,339] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_27-model_states.pt. +[2025-05-05 23:11:34,342] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_28-model_states.pt... +[2025-05-05 23:11:34,348] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_28-model_states.pt. +[2025-05-05 23:11:34,352] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_29-model_states.pt... +[2025-05-05 23:11:34,358] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_29-model_states.pt. +[2025-05-05 23:11:34,362] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_30-model_states.pt... +[2025-05-05 23:11:34,368] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_30-model_states.pt. +[2025-05-05 23:11:34,371] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_31-model_states.pt... +[2025-05-05 23:11:34,378] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_31-model_states.pt. +[2025-05-05 23:11:34,381] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_32-model_states.pt... +[2025-05-05 23:11:34,388] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_32-model_states.pt. +[2025-05-05 23:11:34,391] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_33-model_states.pt... +[2025-05-05 23:11:34,397] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_33-model_states.pt. +[2025-05-05 23:11:34,401] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_34-model_states.pt... +[2025-05-05 23:11:34,407] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_34-model_states.pt. +[2025-05-05 23:11:34,411] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_35-model_states.pt... +[2025-05-05 23:11:34,417] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_35-model_states.pt. +[2025-05-05 23:11:34,421] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_36-model_states.pt... +[2025-05-05 23:11:34,427] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_36-model_states.pt. +[2025-05-05 23:11:34,431] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_37-model_states.pt... +[2025-05-05 23:11:34,437] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_37-model_states.pt. +[2025-05-05 23:11:34,440] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_38-model_states.pt... +[2025-05-05 23:11:34,447] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_38-model_states.pt. +[2025-05-05 23:11:34,450] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_39-model_states.pt... +[2025-05-05 23:11:34,457] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_39-model_states.pt. +[2025-05-05 23:11:34,460] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_40-model_states.pt... +[2025-05-05 23:11:34,467] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_40-model_states.pt. +[2025-05-05 23:11:34,467] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_41-model_states.pt... +[2025-05-05 23:11:34,467] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/layer_41-model_states.pt. +[2025-05-05 23:11:34,486] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/mp_rank_00_model_states.pt +[2025-05-05 23:11:34,486] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/mp_rank_00_model_states.pt... +[2025-05-05 23:11:35,491] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step1347/mp_rank_00_model_states.pt. +[2025-05-05 23:11:35,492] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1347 is ready now! +[2025-05-05 23:11:46,161] [INFO] [logging.py:107:log_dist] [Rank 0] step=1348, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1348 loss: 0.0487 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:11:56,833] [INFO] [logging.py:107:log_dist] [Rank 0] step=1349, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1349 loss: 0.0541 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:12:07,502] [INFO] [logging.py:107:log_dist] [Rank 0] step=1350, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1350 loss: 0.0492 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:12:18,179] [INFO] [logging.py:107:log_dist] [Rank 0] step=1351, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1351 loss: 0.1119 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:12:28,848] [INFO] [logging.py:107:log_dist] [Rank 0] step=1352, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1352 loss: 0.0468 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:12:39,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=1353, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1353 loss: 0.0495 iter time (s): 10.653 samples/sec: 0.094 +Started new epoch: 34 +[2025-05-05 23:12:50,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=1354, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1354 loss: 0.0857 iter time (s): 10.812 samples/sec: 0.092 +[2025-05-05 23:13:01,425] [INFO] [logging.py:107:log_dist] [Rank 0] step=1355, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1355 loss: 0.2162 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:13:12,098] [INFO] [logging.py:107:log_dist] [Rank 0] step=1356, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1356 loss: 0.0779 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:13:22,774] [INFO] [logging.py:107:log_dist] [Rank 0] step=1357, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1357 loss: 0.0407 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:13:33,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=1358, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1358 loss: 0.1399 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:13:44,112] [INFO] [logging.py:107:log_dist] [Rank 0] step=1359, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1359 loss: 0.0277 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:13:54,786] [INFO] [logging.py:107:log_dist] [Rank 0] step=1360, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1360 loss: 0.0333 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:14:05,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=1361, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1361 loss: 0.1171 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:14:16,313] [INFO] [logging.py:107:log_dist] [Rank 0] step=1362, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1362 loss: 0.0355 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 23:14:26,990] [INFO] [logging.py:107:log_dist] [Rank 0] step=1363, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1363 loss: 0.2273 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:14:37,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=1364, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1364 loss: 0.0365 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 23:14:48,347] [INFO] [logging.py:107:log_dist] [Rank 0] step=1365, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1365 loss: 0.0706 iter time (s): 10.659 samples/sec: 0.094 +[2025-05-05 23:14:59,020] [INFO] [logging.py:107:log_dist] [Rank 0] step=1366, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1366 loss: 0.0817 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:15:09,694] [INFO] [logging.py:107:log_dist] [Rank 0] step=1367, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1367 loss: 0.0379 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:15:20,369] [INFO] [logging.py:107:log_dist] [Rank 0] step=1368, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1368 loss: 0.1260 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:15:31,040] [INFO] [logging.py:107:log_dist] [Rank 0] step=1369, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1369 loss: 0.1491 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:15:41,716] [INFO] [logging.py:107:log_dist] [Rank 0] step=1370, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1370 loss: 0.0436 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:15:52,553] [INFO] [logging.py:107:log_dist] [Rank 0] step=1371, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1371 loss: 0.2333 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 23:16:03,226] [INFO] [logging.py:107:log_dist] [Rank 0] step=1372, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1372 loss: 0.0803 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:16:13,894] [INFO] [logging.py:107:log_dist] [Rank 0] step=1373, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1373 loss: 0.0566 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 23:16:24,566] [INFO] [logging.py:107:log_dist] [Rank 0] step=1374, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1374 loss: 0.1024 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:16:35,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=1375, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1375 loss: 0.0441 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:16:45,904] [INFO] [logging.py:107:log_dist] [Rank 0] step=1376, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1376 loss: 0.0488 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:16:56,589] [INFO] [logging.py:107:log_dist] [Rank 0] step=1377, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1377 loss: 0.1379 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-05 23:17:07,264] [INFO] [logging.py:107:log_dist] [Rank 0] step=1378, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1378 loss: 0.0412 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:17:18,121] [INFO] [logging.py:107:log_dist] [Rank 0] step=1379, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1379 loss: 0.1801 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-05 23:17:28,799] [INFO] [logging.py:107:log_dist] [Rank 0] step=1380, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1380 loss: 0.0350 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:17:39,472] [INFO] [logging.py:107:log_dist] [Rank 0] step=1381, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1381 loss: 0.2822 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:17:50,141] [INFO] [logging.py:107:log_dist] [Rank 0] step=1382, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1382 loss: 0.1696 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:18:00,814] [INFO] [logging.py:107:log_dist] [Rank 0] step=1383, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1383 loss: 0.1604 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:18:11,488] [INFO] [logging.py:107:log_dist] [Rank 0] step=1384, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1384 loss: 0.1311 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:18:22,156] [INFO] [logging.py:107:log_dist] [Rank 0] step=1385, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1385 loss: 0.0763 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:18:32,835] [INFO] [logging.py:107:log_dist] [Rank 0] step=1386, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1386 loss: 0.1361 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:18:43,507] [INFO] [logging.py:107:log_dist] [Rank 0] step=1387, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1387 loss: 0.1863 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:18:54,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=1388, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1388 loss: 0.0751 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-05 23:19:05,040] [INFO] [logging.py:107:log_dist] [Rank 0] step=1389, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1389 loss: 0.0688 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:19:15,708] [INFO] [logging.py:107:log_dist] [Rank 0] step=1390, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1390 loss: 0.0731 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:19:26,378] [INFO] [logging.py:107:log_dist] [Rank 0] step=1391, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1391 loss: 0.2152 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:19:37,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=1392, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1392 loss: 0.0621 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:19:47,721] [INFO] [logging.py:107:log_dist] [Rank 0] step=1393, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1393 loss: 0.0435 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:19:58,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=1394, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1394 loss: 0.0796 iter time (s): 10.636 samples/sec: 0.094 +Started new epoch: 35 +[2025-05-05 23:20:09,399] [INFO] [logging.py:107:log_dist] [Rank 0] step=1395, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1395 loss: 0.1108 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 23:20:20,068] [INFO] [logging.py:107:log_dist] [Rank 0] step=1396, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1396 loss: 0.0389 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:20:30,894] [INFO] [logging.py:107:log_dist] [Rank 0] step=1397, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1397 loss: 0.0597 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-05 23:20:41,565] [INFO] [logging.py:107:log_dist] [Rank 0] step=1398, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1398 loss: 0.1357 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:20:52,238] [INFO] [logging.py:107:log_dist] [Rank 0] step=1399, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1399 loss: 0.0628 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:21:02,921] [INFO] [logging.py:107:log_dist] [Rank 0] step=1400, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1400 loss: 0.0914 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:21:13,599] [INFO] [logging.py:107:log_dist] [Rank 0] step=1401, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1401 loss: 0.0425 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:21:24,273] [INFO] [logging.py:107:log_dist] [Rank 0] step=1402, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1402 loss: 0.1451 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:21:34,940] [INFO] [logging.py:107:log_dist] [Rank 0] step=1403, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1403 loss: 0.0938 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 23:21:45,614] [INFO] [logging.py:107:log_dist] [Rank 0] step=1404, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1404 loss: 0.0876 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:21:56,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=1405, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1405 loss: 0.0671 iter time (s): 10.812 samples/sec: 0.092 +[2025-05-05 23:22:07,133] [INFO] [logging.py:107:log_dist] [Rank 0] step=1406, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1406 loss: 0.0559 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:22:17,810] [INFO] [logging.py:107:log_dist] [Rank 0] step=1407, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1407 loss: 0.0308 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:22:28,480] [INFO] [logging.py:107:log_dist] [Rank 0] step=1408, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1408 loss: 0.0870 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:22:39,152] [INFO] [logging.py:107:log_dist] [Rank 0] step=1409, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1409 loss: 0.0249 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:22:49,825] [INFO] [logging.py:107:log_dist] [Rank 0] step=1410, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1410 loss: 0.0913 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:23:00,502] [INFO] [logging.py:107:log_dist] [Rank 0] step=1411, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1411 loss: 0.0798 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:23:11,175] [INFO] [logging.py:107:log_dist] [Rank 0] step=1412, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1412 loss: 0.0386 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:23:21,851] [INFO] [logging.py:107:log_dist] [Rank 0] step=1413, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1413 loss: 0.1962 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:23:32,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=1414, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1414 loss: 0.0674 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-05 23:23:43,360] [INFO] [logging.py:107:log_dist] [Rank 0] step=1415, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1415 loss: 0.0732 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:23:54,032] [INFO] [logging.py:107:log_dist] [Rank 0] step=1416, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1416 loss: 0.1841 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:24:04,702] [INFO] [logging.py:107:log_dist] [Rank 0] step=1417, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1417 loss: 0.1050 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:24:15,378] [INFO] [logging.py:107:log_dist] [Rank 0] step=1418, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1418 loss: 0.0493 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:24:26,051] [INFO] [logging.py:107:log_dist] [Rank 0] step=1419, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1419 loss: 0.0703 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:24:36,724] [INFO] [logging.py:107:log_dist] [Rank 0] step=1420, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1420 loss: 0.0548 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:24:47,404] [INFO] [logging.py:107:log_dist] [Rank 0] step=1421, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1421 loss: 0.0928 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 23:24:58,270] [INFO] [logging.py:107:log_dist] [Rank 0] step=1422, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1422 loss: 0.0498 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-05 23:25:08,946] [INFO] [logging.py:107:log_dist] [Rank 0] step=1423, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1423 loss: 0.0261 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:25:19,626] [INFO] [logging.py:107:log_dist] [Rank 0] step=1424, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1424 loss: 0.0416 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:25:30,303] [INFO] [logging.py:107:log_dist] [Rank 0] step=1425, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1425 loss: 0.1263 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:25:40,973] [INFO] [logging.py:107:log_dist] [Rank 0] step=1426, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1426 loss: 0.0432 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:25:51,649] [INFO] [logging.py:107:log_dist] [Rank 0] step=1427, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1427 loss: 0.0513 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:26:02,317] [INFO] [logging.py:107:log_dist] [Rank 0] step=1428, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1428 loss: 0.0534 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:26:12,990] [INFO] [logging.py:107:log_dist] [Rank 0] step=1429, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1429 loss: 0.0387 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:26:23,667] [INFO] [logging.py:107:log_dist] [Rank 0] step=1430, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1430 loss: 0.0982 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:26:34,526] [INFO] [logging.py:107:log_dist] [Rank 0] step=1431, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1431 loss: 0.0460 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-05 23:26:45,196] [INFO] [logging.py:107:log_dist] [Rank 0] step=1432, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1432 loss: 0.0322 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:26:55,872] [INFO] [logging.py:107:log_dist] [Rank 0] step=1433, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1433 loss: 0.0949 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:27:06,558] [INFO] [logging.py:107:log_dist] [Rank 0] step=1434, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1434 loss: 0.0359 iter time (s): 10.657 samples/sec: 0.094 +[2025-05-05 23:27:17,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=1435, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1435 loss: 0.0440 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 36 +[2025-05-05 23:27:28,239] [INFO] [logging.py:107:log_dist] [Rank 0] step=1436, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1436 loss: 0.0678 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:27:38,910] [INFO] [logging.py:107:log_dist] [Rank 0] step=1437, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1437 loss: 0.0597 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:27:49,579] [INFO] [logging.py:107:log_dist] [Rank 0] step=1438, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1438 loss: 0.0373 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:28:00,249] [INFO] [logging.py:107:log_dist] [Rank 0] step=1439, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1439 loss: 0.0305 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:28:11,081] [INFO] [logging.py:107:log_dist] [Rank 0] step=1440, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1440 loss: 0.2076 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 23:28:21,748] [INFO] [logging.py:107:log_dist] [Rank 0] step=1441, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1441 loss: 0.0565 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:28:32,417] [INFO] [logging.py:107:log_dist] [Rank 0] step=1442, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1442 loss: 0.3259 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:28:43,086] [INFO] [logging.py:107:log_dist] [Rank 0] step=1443, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1443 loss: 0.0537 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:28:53,756] [INFO] [logging.py:107:log_dist] [Rank 0] step=1444, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1444 loss: 0.4038 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:29:04,436] [INFO] [logging.py:107:log_dist] [Rank 0] step=1445, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1445 loss: 0.1740 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:29:15,104] [INFO] [logging.py:107:log_dist] [Rank 0] step=1446, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1446 loss: 0.0410 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:29:25,772] [INFO] [logging.py:107:log_dist] [Rank 0] step=1447, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1447 loss: 0.0681 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:29:36,644] [INFO] [logging.py:107:log_dist] [Rank 0] step=1448, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1448 loss: 0.0837 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-05 23:29:47,315] [INFO] [logging.py:107:log_dist] [Rank 0] step=1449, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1449 loss: 0.0279 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:29:57,988] [INFO] [logging.py:107:log_dist] [Rank 0] step=1450, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1450 loss: 0.0682 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:30:08,668] [INFO] [logging.py:107:log_dist] [Rank 0] step=1451, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1451 loss: 0.0361 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 23:30:19,337] [INFO] [logging.py:107:log_dist] [Rank 0] step=1452, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1452 loss: 0.0442 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:30:30,006] [INFO] [logging.py:107:log_dist] [Rank 0] step=1453, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1453 loss: 0.0734 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:30:40,679] [INFO] [logging.py:107:log_dist] [Rank 0] step=1454, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1454 loss: 0.0511 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:30:51,347] [INFO] [logging.py:107:log_dist] [Rank 0] step=1455, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1455 loss: 0.1246 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:31:02,015] [INFO] [logging.py:107:log_dist] [Rank 0] step=1456, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1456 loss: 0.0968 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:31:12,880] [INFO] [logging.py:107:log_dist] [Rank 0] step=1457, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1457 loss: 0.0291 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-05 23:31:23,550] [INFO] [logging.py:107:log_dist] [Rank 0] step=1458, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1458 loss: 0.0437 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:31:34,220] [INFO] [logging.py:107:log_dist] [Rank 0] step=1459, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1459 loss: 0.0503 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:31:44,904] [INFO] [logging.py:107:log_dist] [Rank 0] step=1460, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1460 loss: 0.1592 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:31:55,579] [INFO] [logging.py:107:log_dist] [Rank 0] step=1461, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1461 loss: 0.0617 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:32:06,258] [INFO] [logging.py:107:log_dist] [Rank 0] step=1462, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1462 loss: 0.1599 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 23:32:16,941] [INFO] [logging.py:107:log_dist] [Rank 0] step=1463, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1463 loss: 0.0930 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 23:32:27,611] [INFO] [logging.py:107:log_dist] [Rank 0] step=1464, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1464 loss: 0.0626 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:32:38,290] [INFO] [logging.py:107:log_dist] [Rank 0] step=1465, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1465 loss: 0.0460 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-05 23:32:49,117] [INFO] [logging.py:107:log_dist] [Rank 0] step=1466, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1466 loss: 0.1701 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-05 23:32:59,792] [INFO] [logging.py:107:log_dist] [Rank 0] step=1467, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1467 loss: 0.0436 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:33:10,467] [INFO] [logging.py:107:log_dist] [Rank 0] step=1468, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1468 loss: 0.0483 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:33:21,144] [INFO] [logging.py:107:log_dist] [Rank 0] step=1469, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1469 loss: 0.0623 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:33:31,814] [INFO] [logging.py:107:log_dist] [Rank 0] step=1470, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1470 loss: 0.0512 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:33:42,487] [INFO] [logging.py:107:log_dist] [Rank 0] step=1471, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1471 loss: 0.0556 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:33:53,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=1472, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1472 loss: 0.0566 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:34:03,833] [INFO] [logging.py:107:log_dist] [Rank 0] step=1473, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1473 loss: 0.0936 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:34:14,684] [INFO] [logging.py:107:log_dist] [Rank 0] step=1474, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1474 loss: 0.1330 iter time (s): 10.821 samples/sec: 0.092 +[2025-05-05 23:34:25,351] [INFO] [logging.py:107:log_dist] [Rank 0] step=1475, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1475 loss: 0.0818 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 23:34:36,018] [INFO] [logging.py:107:log_dist] [Rank 0] step=1476, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1476 loss: 0.0784 iter time (s): 10.640 samples/sec: 0.094 +Started new epoch: 37 +[2025-05-05 23:34:47,040] [INFO] [logging.py:107:log_dist] [Rank 0] step=1477, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1477 loss: 0.0294 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:34:57,714] [INFO] [logging.py:107:log_dist] [Rank 0] step=1478, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1478 loss: 0.0391 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:35:08,389] [INFO] [logging.py:107:log_dist] [Rank 0] step=1479, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1479 loss: 0.3703 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:35:19,067] [INFO] [logging.py:107:log_dist] [Rank 0] step=1480, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1480 loss: 0.1674 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:35:29,737] [INFO] [logging.py:107:log_dist] [Rank 0] step=1481, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1481 loss: 0.0496 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:35:40,404] [INFO] [logging.py:107:log_dist] [Rank 0] step=1482, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1482 loss: 0.0516 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-05 23:35:51,248] [INFO] [logging.py:107:log_dist] [Rank 0] step=1483, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1483 loss: 0.1796 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-05 23:36:01,918] [INFO] [logging.py:107:log_dist] [Rank 0] step=1484, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1484 loss: 0.1987 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:36:12,593] [INFO] [logging.py:107:log_dist] [Rank 0] step=1485, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1485 loss: 0.0938 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:36:23,272] [INFO] [logging.py:107:log_dist] [Rank 0] step=1486, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1486 loss: 0.0385 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 23:36:33,944] [INFO] [logging.py:107:log_dist] [Rank 0] step=1487, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1487 loss: 0.0389 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:36:44,615] [INFO] [logging.py:107:log_dist] [Rank 0] step=1488, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1488 loss: 0.0436 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:36:55,288] [INFO] [logging.py:107:log_dist] [Rank 0] step=1489, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1489 loss: 0.1495 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:37:05,960] [INFO] [logging.py:107:log_dist] [Rank 0] step=1490, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1490 loss: 0.0319 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:37:16,819] [INFO] [logging.py:107:log_dist] [Rank 0] step=1491, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1491 loss: 0.0421 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-05 23:37:27,495] [INFO] [logging.py:107:log_dist] [Rank 0] step=1492, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1492 loss: 0.1172 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:37:38,167] [INFO] [logging.py:107:log_dist] [Rank 0] step=1493, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1493 loss: 0.0648 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:37:48,839] [INFO] [logging.py:107:log_dist] [Rank 0] step=1494, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1494 loss: 0.0674 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:37:59,519] [INFO] [logging.py:107:log_dist] [Rank 0] step=1495, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1495 loss: 0.0964 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:38:10,202] [INFO] [logging.py:107:log_dist] [Rank 0] step=1496, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1496 loss: 0.0845 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:38:20,872] [INFO] [logging.py:107:log_dist] [Rank 0] step=1497, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1497 loss: 0.0283 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:38:31,545] [INFO] [logging.py:107:log_dist] [Rank 0] step=1498, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1498 loss: 0.2045 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:38:42,220] [INFO] [logging.py:107:log_dist] [Rank 0] step=1499, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1499 loss: 0.1435 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:38:53,085] [INFO] [logging.py:107:log_dist] [Rank 0] step=1500, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1500 loss: 0.0374 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-05 23:39:03,767] [INFO] [logging.py:107:log_dist] [Rank 0] step=1501, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1501 loss: 0.0951 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 23:39:14,438] [INFO] [logging.py:107:log_dist] [Rank 0] step=1502, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1502 loss: 0.0699 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:39:25,113] [INFO] [logging.py:107:log_dist] [Rank 0] step=1503, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1503 loss: 0.0388 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:39:35,788] [INFO] [logging.py:107:log_dist] [Rank 0] step=1504, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1504 loss: 0.0441 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:39:46,460] [INFO] [logging.py:107:log_dist] [Rank 0] step=1505, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1505 loss: 0.1175 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:39:57,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=1506, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1506 loss: 0.0514 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:40:07,812] [INFO] [logging.py:107:log_dist] [Rank 0] step=1507, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1507 loss: 0.1570 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:40:18,485] [INFO] [logging.py:107:log_dist] [Rank 0] step=1508, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1508 loss: 0.2732 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:40:29,314] [INFO] [logging.py:107:log_dist] [Rank 0] step=1509, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1509 loss: 0.1570 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-05 23:40:39,987] [INFO] [logging.py:107:log_dist] [Rank 0] step=1510, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1510 loss: 0.0339 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:40:50,659] [INFO] [logging.py:107:log_dist] [Rank 0] step=1511, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1511 loss: 0.0439 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:41:01,329] [INFO] [logging.py:107:log_dist] [Rank 0] step=1512, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1512 loss: 0.0320 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:41:12,004] [INFO] [logging.py:107:log_dist] [Rank 0] step=1513, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1513 loss: 0.2172 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:41:22,673] [INFO] [logging.py:107:log_dist] [Rank 0] step=1514, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1514 loss: 0.0577 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:41:33,342] [INFO] [logging.py:107:log_dist] [Rank 0] step=1515, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1515 loss: 0.0530 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:41:44,017] [INFO] [logging.py:107:log_dist] [Rank 0] step=1516, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1516 loss: 0.0332 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:41:54,870] [INFO] [logging.py:107:log_dist] [Rank 0] step=1517, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1517 loss: 0.1998 iter time (s): 10.827 samples/sec: 0.092 +Started new epoch: 38 +[2025-05-05 23:42:05,877] [INFO] [logging.py:107:log_dist] [Rank 0] step=1518, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1518 loss: 0.0897 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:42:16,555] [INFO] [logging.py:107:log_dist] [Rank 0] step=1519, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1519 loss: 0.0638 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:42:27,227] [INFO] [logging.py:107:log_dist] [Rank 0] step=1520, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1520 loss: 0.0419 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:42:37,899] [INFO] [logging.py:107:log_dist] [Rank 0] step=1521, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1521 loss: 0.0305 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:42:48,574] [INFO] [logging.py:107:log_dist] [Rank 0] step=1522, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1522 loss: 0.1141 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:42:59,246] [INFO] [logging.py:107:log_dist] [Rank 0] step=1523, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1523 loss: 0.1590 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:43:09,918] [INFO] [logging.py:107:log_dist] [Rank 0] step=1524, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1524 loss: 0.0806 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:43:20,592] [INFO] [logging.py:107:log_dist] [Rank 0] step=1525, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1525 loss: 0.0461 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:43:31,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=1526, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1526 loss: 0.1528 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-05 23:43:42,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=1527, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1527 loss: 0.0343 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:43:52,804] [INFO] [logging.py:107:log_dist] [Rank 0] step=1528, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1528 loss: 0.2769 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:44:03,480] [INFO] [logging.py:107:log_dist] [Rank 0] step=1529, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1529 loss: 0.0615 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:44:14,164] [INFO] [logging.py:107:log_dist] [Rank 0] step=1530, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1530 loss: 0.0720 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:44:24,833] [INFO] [logging.py:107:log_dist] [Rank 0] step=1531, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1531 loss: 0.1155 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:44:35,505] [INFO] [logging.py:107:log_dist] [Rank 0] step=1532, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1532 loss: 0.0249 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:44:46,180] [INFO] [logging.py:107:log_dist] [Rank 0] step=1533, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1533 loss: 0.0347 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:44:56,849] [INFO] [logging.py:107:log_dist] [Rank 0] step=1534, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1534 loss: 0.0408 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:45:07,681] [INFO] [logging.py:107:log_dist] [Rank 0] step=1535, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1535 loss: 0.0421 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-05 23:45:18,355] [INFO] [logging.py:107:log_dist] [Rank 0] step=1536, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1536 loss: 0.0588 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:45:29,027] [INFO] [logging.py:107:log_dist] [Rank 0] step=1537, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1537 loss: 0.0985 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:45:39,696] [INFO] [logging.py:107:log_dist] [Rank 0] step=1538, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1538 loss: 0.0461 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:45:50,370] [INFO] [logging.py:107:log_dist] [Rank 0] step=1539, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1539 loss: 0.0724 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:46:01,038] [INFO] [logging.py:107:log_dist] [Rank 0] step=1540, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1540 loss: 0.0355 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:46:11,709] [INFO] [logging.py:107:log_dist] [Rank 0] step=1541, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1541 loss: 0.0381 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:46:22,382] [INFO] [logging.py:107:log_dist] [Rank 0] step=1542, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1542 loss: 0.0922 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:46:33,225] [INFO] [logging.py:107:log_dist] [Rank 0] step=1543, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1543 loss: 0.1020 iter time (s): 10.810 samples/sec: 0.093 +[2025-05-05 23:46:43,897] [INFO] [logging.py:107:log_dist] [Rank 0] step=1544, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1544 loss: 0.0859 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:46:54,569] [INFO] [logging.py:107:log_dist] [Rank 0] step=1545, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1545 loss: 0.0713 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:47:05,257] [INFO] [logging.py:107:log_dist] [Rank 0] step=1546, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1546 loss: 0.0550 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-05 23:47:15,927] [INFO] [logging.py:107:log_dist] [Rank 0] step=1547, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1547 loss: 0.0918 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:47:26,603] [INFO] [logging.py:107:log_dist] [Rank 0] step=1548, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1548 loss: 0.1333 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:47:37,274] [INFO] [logging.py:107:log_dist] [Rank 0] step=1549, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1549 loss: 0.0596 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:47:47,943] [INFO] [logging.py:107:log_dist] [Rank 0] step=1550, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1550 loss: 0.0332 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:47:58,616] [INFO] [logging.py:107:log_dist] [Rank 0] step=1551, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1551 loss: 0.0515 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:48:09,447] [INFO] [logging.py:107:log_dist] [Rank 0] step=1552, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1552 loss: 0.0994 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-05 23:48:20,119] [INFO] [logging.py:107:log_dist] [Rank 0] step=1553, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1553 loss: 0.0646 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:48:30,791] [INFO] [logging.py:107:log_dist] [Rank 0] step=1554, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1554 loss: 0.0577 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:48:41,460] [INFO] [logging.py:107:log_dist] [Rank 0] step=1555, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1555 loss: 0.1201 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:48:52,127] [INFO] [logging.py:107:log_dist] [Rank 0] step=1556, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1556 loss: 0.1362 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:49:02,799] [INFO] [logging.py:107:log_dist] [Rank 0] step=1557, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1557 loss: 0.0357 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:49:13,465] [INFO] [logging.py:107:log_dist] [Rank 0] step=1558, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1558 loss: 0.0760 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 39 +[2025-05-05 23:49:24,481] [INFO] [logging.py:107:log_dist] [Rank 0] step=1559, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1559 loss: 0.0362 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:49:35,341] [INFO] [logging.py:107:log_dist] [Rank 0] step=1560, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1560 loss: 0.0387 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-05 23:49:46,011] [INFO] [logging.py:107:log_dist] [Rank 0] step=1561, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1561 loss: 0.0436 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:49:56,682] [INFO] [logging.py:107:log_dist] [Rank 0] step=1562, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1562 loss: 0.0762 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:50:07,363] [INFO] [logging.py:107:log_dist] [Rank 0] step=1563, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1563 loss: 0.2033 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-05 23:50:18,034] [INFO] [logging.py:107:log_dist] [Rank 0] step=1564, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1564 loss: 0.0299 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:50:28,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=1565, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1565 loss: 0.0680 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:50:39,387] [INFO] [logging.py:107:log_dist] [Rank 0] step=1566, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1566 loss: 0.0547 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-05 23:50:50,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=1567, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1567 loss: 0.0371 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:51:00,731] [INFO] [logging.py:107:log_dist] [Rank 0] step=1568, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1568 loss: 0.3786 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:51:11,598] [INFO] [logging.py:107:log_dist] [Rank 0] step=1569, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1569 loss: 0.2927 iter time (s): 10.836 samples/sec: 0.092 +[2025-05-05 23:51:22,267] [INFO] [logging.py:107:log_dist] [Rank 0] step=1570, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1570 loss: 0.0430 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:51:32,940] [INFO] [logging.py:107:log_dist] [Rank 0] step=1571, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1571 loss: 0.0550 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:51:43,615] [INFO] [logging.py:107:log_dist] [Rank 0] step=1572, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1572 loss: 0.1419 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:51:54,282] [INFO] [logging.py:107:log_dist] [Rank 0] step=1573, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1573 loss: 0.0670 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-05 23:52:04,954] [INFO] [logging.py:107:log_dist] [Rank 0] step=1574, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1574 loss: 0.0532 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:52:15,629] [INFO] [logging.py:107:log_dist] [Rank 0] step=1575, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1575 loss: 0.0320 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:52:26,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=1576, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1576 loss: 0.0621 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:52:36,972] [INFO] [logging.py:107:log_dist] [Rank 0] step=1577, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1577 loss: 0.0398 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:52:47,818] [INFO] [logging.py:107:log_dist] [Rank 0] step=1578, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1578 loss: 0.1081 iter time (s): 10.815 samples/sec: 0.092 +[2025-05-05 23:52:58,488] [INFO] [logging.py:107:log_dist] [Rank 0] step=1579, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1579 loss: 0.0515 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:53:09,163] [INFO] [logging.py:107:log_dist] [Rank 0] step=1580, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1580 loss: 0.0916 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-05 23:53:19,843] [INFO] [logging.py:107:log_dist] [Rank 0] step=1581, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1581 loss: 0.0705 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-05 23:53:30,514] [INFO] [logging.py:107:log_dist] [Rank 0] step=1582, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1582 loss: 0.0308 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:53:41,184] [INFO] [logging.py:107:log_dist] [Rank 0] step=1583, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1583 loss: 0.0896 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:53:51,857] [INFO] [logging.py:107:log_dist] [Rank 0] step=1584, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1584 loss: 0.0622 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:54:02,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=1585, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1585 loss: 0.0366 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:54:13,400] [INFO] [logging.py:107:log_dist] [Rank 0] step=1586, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1586 loss: 0.0750 iter time (s): 10.838 samples/sec: 0.092 +[2025-05-05 23:54:24,076] [INFO] [logging.py:107:log_dist] [Rank 0] step=1587, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1587 loss: 0.0528 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:54:34,745] [INFO] [logging.py:107:log_dist] [Rank 0] step=1588, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1588 loss: 0.3739 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:54:45,417] [INFO] [logging.py:107:log_dist] [Rank 0] step=1589, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1589 loss: 0.0650 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:54:56,092] [INFO] [logging.py:107:log_dist] [Rank 0] step=1590, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1590 loss: 0.0428 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:55:06,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=1591, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1591 loss: 0.0635 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:55:17,436] [INFO] [logging.py:107:log_dist] [Rank 0] step=1592, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1592 loss: 0.0583 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:55:28,107] [INFO] [logging.py:107:log_dist] [Rank 0] step=1593, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1593 loss: 0.0298 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:55:38,775] [INFO] [logging.py:107:log_dist] [Rank 0] step=1594, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1594 loss: 0.0543 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:55:49,633] [INFO] [logging.py:107:log_dist] [Rank 0] step=1595, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1595 loss: 0.0353 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-05 23:56:00,304] [INFO] [logging.py:107:log_dist] [Rank 0] step=1596, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1596 loss: 0.0738 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:56:10,980] [INFO] [logging.py:107:log_dist] [Rank 0] step=1597, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1597 loss: 0.0435 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:56:21,650] [INFO] [logging.py:107:log_dist] [Rank 0] step=1598, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1598 loss: 0.0399 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-05 23:56:32,317] [INFO] [logging.py:107:log_dist] [Rank 0] step=1599, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1599 loss: 0.0678 iter time (s): 10.641 samples/sec: 0.094 +Started new epoch: 40 +[2025-05-05 23:56:43,327] [INFO] [logging.py:107:log_dist] [Rank 0] step=1600, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1600 loss: 0.0363 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:56:54,000] [INFO] [logging.py:107:log_dist] [Rank 0] step=1601, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1601 loss: 0.0476 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:57:04,673] [INFO] [logging.py:107:log_dist] [Rank 0] step=1602, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1602 loss: 0.3247 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:57:15,345] [INFO] [logging.py:107:log_dist] [Rank 0] step=1603, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1603 loss: 0.0643 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:57:26,182] [INFO] [logging.py:107:log_dist] [Rank 0] step=1604, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1604 loss: 0.0314 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-05 23:57:36,853] [INFO] [logging.py:107:log_dist] [Rank 0] step=1605, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1605 loss: 0.0445 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:57:47,521] [INFO] [logging.py:107:log_dist] [Rank 0] step=1606, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1606 loss: 0.3022 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:57:58,196] [INFO] [logging.py:107:log_dist] [Rank 0] step=1607, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1607 loss: 0.0321 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:58:08,868] [INFO] [logging.py:107:log_dist] [Rank 0] step=1608, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1608 loss: 0.0374 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-05 23:58:19,537] [INFO] [logging.py:107:log_dist] [Rank 0] step=1609, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1609 loss: 0.0347 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-05 23:58:30,221] [INFO] [logging.py:107:log_dist] [Rank 0] step=1610, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1610 loss: 0.0505 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-05 23:58:40,895] [INFO] [logging.py:107:log_dist] [Rank 0] step=1611, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1611 loss: 0.0313 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:58:51,729] [INFO] [logging.py:107:log_dist] [Rank 0] step=1612, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1612 loss: 0.2061 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-05 23:59:02,410] [INFO] [logging.py:107:log_dist] [Rank 0] step=1613, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1613 loss: 0.0527 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-05 23:59:13,086] [INFO] [logging.py:107:log_dist] [Rank 0] step=1614, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1614 loss: 0.0396 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-05 23:59:23,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=1615, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1615 loss: 0.0311 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-05 23:59:34,428] [INFO] [logging.py:107:log_dist] [Rank 0] step=1616, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1616 loss: 0.0736 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-05 23:59:45,102] [INFO] [logging.py:107:log_dist] [Rank 0] step=1617, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1617 loss: 0.0714 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-05 23:59:55,776] [INFO] [logging.py:107:log_dist] [Rank 0] step=1618, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1618 loss: 0.0449 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:00:06,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=1619, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1619 loss: 0.0625 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 00:00:17,133] [INFO] [logging.py:107:log_dist] [Rank 0] step=1620, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1620 loss: 0.0434 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:00:27,963] [INFO] [logging.py:107:log_dist] [Rank 0] step=1621, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1621 loss: 0.0549 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 00:00:38,634] [INFO] [logging.py:107:log_dist] [Rank 0] step=1622, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1622 loss: 0.1043 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:00:49,305] [INFO] [logging.py:107:log_dist] [Rank 0] step=1623, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1623 loss: 0.0383 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:00:59,973] [INFO] [logging.py:107:log_dist] [Rank 0] step=1624, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1624 loss: 0.0395 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:01:10,654] [INFO] [logging.py:107:log_dist] [Rank 0] step=1625, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1625 loss: 0.0278 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 00:01:21,322] [INFO] [logging.py:107:log_dist] [Rank 0] step=1626, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1626 loss: 0.0415 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:01:31,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=1627, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1627 loss: 0.0550 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:01:42,672] [INFO] [logging.py:107:log_dist] [Rank 0] step=1628, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1628 loss: 0.0421 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:01:53,539] [INFO] [logging.py:107:log_dist] [Rank 0] step=1629, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1629 loss: 0.3243 iter time (s): 10.837 samples/sec: 0.092 +[2025-05-06 00:02:04,210] [INFO] [logging.py:107:log_dist] [Rank 0] step=1630, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1630 loss: 0.0355 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:02:14,883] [INFO] [logging.py:107:log_dist] [Rank 0] step=1631, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1631 loss: 0.0409 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:02:25,560] [INFO] [logging.py:107:log_dist] [Rank 0] step=1632, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1632 loss: 0.0375 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:02:36,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=1633, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1633 loss: 0.1742 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:02:46,908] [INFO] [logging.py:107:log_dist] [Rank 0] step=1634, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1634 loss: 0.0259 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:02:57,588] [INFO] [logging.py:107:log_dist] [Rank 0] step=1635, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1635 loss: 0.0829 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 00:03:08,263] [INFO] [logging.py:107:log_dist] [Rank 0] step=1636, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1636 loss: 0.0318 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:03:18,959] [INFO] [logging.py:107:log_dist] [Rank 0] step=1637, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1637 loss: 0.0752 iter time (s): 10.665 samples/sec: 0.094 +[2025-05-06 00:03:29,829] [INFO] [logging.py:107:log_dist] [Rank 0] step=1638, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1638 loss: 0.1744 iter time (s): 10.838 samples/sec: 0.092 +[2025-05-06 00:03:40,504] [INFO] [logging.py:107:log_dist] [Rank 0] step=1639, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1639 loss: 0.0318 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:03:51,172] [INFO] [logging.py:107:log_dist] [Rank 0] step=1640, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1640 loss: 0.0452 iter time (s): 10.641 samples/sec: 0.094 +Saving model to directory epoch40 +Started new epoch: 41 +[2025-05-06 00:04:03,884] [INFO] [logging.py:107:log_dist] [Rank 0] step=1641, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1641 loss: 0.0644 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:04:14,555] [INFO] [logging.py:107:log_dist] [Rank 0] step=1642, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1642 loss: 0.0696 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:04:25,224] [INFO] [logging.py:107:log_dist] [Rank 0] step=1643, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1643 loss: 0.0497 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:04:35,891] [INFO] [logging.py:107:log_dist] [Rank 0] step=1644, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1644 loss: 0.0500 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:04:46,566] [INFO] [logging.py:107:log_dist] [Rank 0] step=1645, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1645 loss: 0.0620 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:04:57,397] [INFO] [logging.py:107:log_dist] [Rank 0] step=1646, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1646 loss: 0.0971 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 00:05:08,069] [INFO] [logging.py:107:log_dist] [Rank 0] step=1647, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1647 loss: 0.2908 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:05:18,743] [INFO] [logging.py:107:log_dist] [Rank 0] step=1648, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1648 loss: 0.0272 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:05:29,415] [INFO] [logging.py:107:log_dist] [Rank 0] step=1649, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1649 loss: 0.0367 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:05:40,086] [INFO] [logging.py:107:log_dist] [Rank 0] step=1650, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1650 loss: 0.0366 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:05:50,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=1651, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1651 loss: 0.1230 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:06:01,440] [INFO] [logging.py:107:log_dist] [Rank 0] step=1652, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1652 loss: 0.1916 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:06:12,125] [INFO] [logging.py:107:log_dist] [Rank 0] step=1653, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1653 loss: 0.0630 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:06:22,797] [INFO] [logging.py:107:log_dist] [Rank 0] step=1654, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1654 loss: 0.0401 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:06:33,631] [INFO] [logging.py:107:log_dist] [Rank 0] step=1655, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1655 loss: 0.0706 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 00:06:44,302] [INFO] [logging.py:107:log_dist] [Rank 0] step=1656, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1656 loss: 0.0636 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:06:54,972] [INFO] [logging.py:107:log_dist] [Rank 0] step=1657, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1657 loss: 0.6011 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:07:05,641] [INFO] [logging.py:107:log_dist] [Rank 0] step=1658, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1658 loss: 0.0688 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:07:16,313] [INFO] [logging.py:107:log_dist] [Rank 0] step=1659, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1659 loss: 0.0861 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:07:26,987] [INFO] [logging.py:107:log_dist] [Rank 0] step=1660, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1660 loss: 0.1712 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:07:37,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=1661, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1661 loss: 0.0296 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:07:48,338] [INFO] [logging.py:107:log_dist] [Rank 0] step=1662, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1662 loss: 0.0531 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:07:59,012] [INFO] [logging.py:107:log_dist] [Rank 0] step=1663, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1663 loss: 0.0857 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:08:09,843] [INFO] [logging.py:107:log_dist] [Rank 0] step=1664, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1664 loss: 0.0464 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 00:08:20,514] [INFO] [logging.py:107:log_dist] [Rank 0] step=1665, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1665 loss: 0.0447 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:08:31,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=1666, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1666 loss: 0.0630 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:08:41,866] [INFO] [logging.py:107:log_dist] [Rank 0] step=1667, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1667 loss: 0.0460 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:08:52,542] [INFO] [logging.py:107:log_dist] [Rank 0] step=1668, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1668 loss: 0.0590 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:09:03,224] [INFO] [logging.py:107:log_dist] [Rank 0] step=1669, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1669 loss: 0.1083 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 00:09:13,893] [INFO] [logging.py:107:log_dist] [Rank 0] step=1670, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1670 loss: 0.1237 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:09:24,564] [INFO] [logging.py:107:log_dist] [Rank 0] step=1671, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1671 loss: 0.0993 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:09:35,431] [INFO] [logging.py:107:log_dist] [Rank 0] step=1672, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1672 loss: 0.0676 iter time (s): 10.835 samples/sec: 0.092 +[2025-05-06 00:09:46,100] [INFO] [logging.py:107:log_dist] [Rank 0] step=1673, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1673 loss: 0.0679 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:09:56,771] [INFO] [logging.py:107:log_dist] [Rank 0] step=1674, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1674 loss: 0.1145 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:10:07,452] [INFO] [logging.py:107:log_dist] [Rank 0] step=1675, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1675 loss: 0.0487 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 00:10:18,122] [INFO] [logging.py:107:log_dist] [Rank 0] step=1676, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1676 loss: 0.0414 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:10:28,798] [INFO] [logging.py:107:log_dist] [Rank 0] step=1677, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1677 loss: 0.0748 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:10:39,470] [INFO] [logging.py:107:log_dist] [Rank 0] step=1678, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1678 loss: 0.1031 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:10:50,140] [INFO] [logging.py:107:log_dist] [Rank 0] step=1679, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1679 loss: 0.0701 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:11:00,810] [INFO] [logging.py:107:log_dist] [Rank 0] step=1680, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1680 loss: 0.0476 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:11:11,670] [INFO] [logging.py:107:log_dist] [Rank 0] step=1681, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1681 loss: 0.0505 iter time (s): 10.833 samples/sec: 0.092 +Started new epoch: 42 +[2025-05-06 00:11:22,681] [INFO] [logging.py:107:log_dist] [Rank 0] step=1682, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1682 loss: 0.0297 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:11:33,367] [INFO] [logging.py:107:log_dist] [Rank 0] step=1683, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1683 loss: 0.0757 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-06 00:11:44,044] [INFO] [logging.py:107:log_dist] [Rank 0] step=1684, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1684 loss: 0.1847 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:11:54,714] [INFO] [logging.py:107:log_dist] [Rank 0] step=1685, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1685 loss: 0.2015 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:12:05,389] [INFO] [logging.py:107:log_dist] [Rank 0] step=1686, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1686 loss: 0.1085 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:12:16,064] [INFO] [logging.py:107:log_dist] [Rank 0] step=1687, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1687 loss: 0.0289 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:12:26,742] [INFO] [logging.py:107:log_dist] [Rank 0] step=1688, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1688 loss: 0.0591 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:12:37,417] [INFO] [logging.py:107:log_dist] [Rank 0] step=1689, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1689 loss: 0.0280 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:12:48,246] [INFO] [logging.py:107:log_dist] [Rank 0] step=1690, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1690 loss: 0.4408 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 00:12:58,921] [INFO] [logging.py:107:log_dist] [Rank 0] step=1691, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1691 loss: 0.2240 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:13:09,599] [INFO] [logging.py:107:log_dist] [Rank 0] step=1692, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1692 loss: 0.0492 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:13:20,271] [INFO] [logging.py:107:log_dist] [Rank 0] step=1693, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1693 loss: 0.0626 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:13:30,944] [INFO] [logging.py:107:log_dist] [Rank 0] step=1694, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1694 loss: 0.0285 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:13:41,619] [INFO] [logging.py:107:log_dist] [Rank 0] step=1695, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1695 loss: 0.1248 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:13:52,288] [INFO] [logging.py:107:log_dist] [Rank 0] step=1696, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1696 loss: 0.0424 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:14:02,960] [INFO] [logging.py:107:log_dist] [Rank 0] step=1697, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1697 loss: 0.0405 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:14:13,822] [INFO] [logging.py:107:log_dist] [Rank 0] step=1698, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1698 loss: 0.0903 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-06 00:14:24,493] [INFO] [logging.py:107:log_dist] [Rank 0] step=1699, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1699 loss: 0.1249 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:14:35,163] [INFO] [logging.py:107:log_dist] [Rank 0] step=1700, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1700 loss: 0.0313 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:14:45,839] [INFO] [logging.py:107:log_dist] [Rank 0] step=1701, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1701 loss: 0.2059 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:14:56,512] [INFO] [logging.py:107:log_dist] [Rank 0] step=1702, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1702 loss: 0.0319 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:15:07,185] [INFO] [logging.py:107:log_dist] [Rank 0] step=1703, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1703 loss: 0.0521 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:15:17,858] [INFO] [logging.py:107:log_dist] [Rank 0] step=1704, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1704 loss: 0.0697 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:15:28,537] [INFO] [logging.py:107:log_dist] [Rank 0] step=1705, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1705 loss: 0.0452 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 00:15:39,204] [INFO] [logging.py:107:log_dist] [Rank 0] step=1706, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1706 loss: 0.0292 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:15:50,071] [INFO] [logging.py:107:log_dist] [Rank 0] step=1707, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1707 loss: 0.0429 iter time (s): 10.835 samples/sec: 0.092 +[2025-05-06 00:16:00,741] [INFO] [logging.py:107:log_dist] [Rank 0] step=1708, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1708 loss: 0.2088 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:16:11,439] [INFO] [logging.py:107:log_dist] [Rank 0] step=1709, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1709 loss: 0.0569 iter time (s): 10.667 samples/sec: 0.094 +[2025-05-06 00:16:22,116] [INFO] [logging.py:107:log_dist] [Rank 0] step=1710, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1710 loss: 0.1142 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:16:32,786] [INFO] [logging.py:107:log_dist] [Rank 0] step=1711, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1711 loss: 0.1216 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:16:43,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=1712, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1712 loss: 0.1235 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:16:54,132] [INFO] [logging.py:107:log_dist] [Rank 0] step=1713, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1713 loss: 0.0356 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:17:04,809] [INFO] [logging.py:107:log_dist] [Rank 0] step=1714, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1714 loss: 0.0325 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:17:15,483] [INFO] [logging.py:107:log_dist] [Rank 0] step=1715, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1715 loss: 0.1864 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:17:26,318] [INFO] [logging.py:107:log_dist] [Rank 0] step=1716, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1716 loss: 0.2613 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 00:17:36,991] [INFO] [logging.py:107:log_dist] [Rank 0] step=1717, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1717 loss: 0.0301 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:17:47,658] [INFO] [logging.py:107:log_dist] [Rank 0] step=1718, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1718 loss: 0.0497 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:17:58,333] [INFO] [logging.py:107:log_dist] [Rank 0] step=1719, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1719 loss: 0.0368 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:18:09,001] [INFO] [logging.py:107:log_dist] [Rank 0] step=1720, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1720 loss: 0.0509 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:18:19,671] [INFO] [logging.py:107:log_dist] [Rank 0] step=1721, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1721 loss: 0.1273 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:18:30,340] [INFO] [logging.py:107:log_dist] [Rank 0] step=1722, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1722 loss: 0.0550 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 43 +[2025-05-06 00:18:41,353] [INFO] [logging.py:107:log_dist] [Rank 0] step=1723, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1723 loss: 0.0428 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:18:52,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=1724, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1724 loss: 0.1045 iter time (s): 10.811 samples/sec: 0.093 +[2025-05-06 00:19:02,870] [INFO] [logging.py:107:log_dist] [Rank 0] step=1725, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1725 loss: 0.1449 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:19:13,550] [INFO] [logging.py:107:log_dist] [Rank 0] step=1726, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1726 loss: 0.0982 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 00:19:24,224] [INFO] [logging.py:107:log_dist] [Rank 0] step=1727, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1727 loss: 0.0380 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:19:34,895] [INFO] [logging.py:107:log_dist] [Rank 0] step=1728, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1728 loss: 0.0585 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:19:45,568] [INFO] [logging.py:107:log_dist] [Rank 0] step=1729, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1729 loss: 0.0542 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:19:56,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=1730, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1730 loss: 0.0319 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:20:06,920] [INFO] [logging.py:107:log_dist] [Rank 0] step=1731, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1731 loss: 0.1457 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 00:20:17,588] [INFO] [logging.py:107:log_dist] [Rank 0] step=1732, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1732 loss: 0.2069 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:20:28,430] [INFO] [logging.py:107:log_dist] [Rank 0] step=1733, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1733 loss: 0.0658 iter time (s): 10.811 samples/sec: 0.092 +[2025-05-06 00:20:39,105] [INFO] [logging.py:107:log_dist] [Rank 0] step=1734, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1734 loss: 0.0530 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:20:49,777] [INFO] [logging.py:107:log_dist] [Rank 0] step=1735, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1735 loss: 0.2340 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:21:00,491] [INFO] [logging.py:107:log_dist] [Rank 0] step=1736, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1736 loss: 0.0430 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:21:11,165] [INFO] [logging.py:107:log_dist] [Rank 0] step=1737, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1737 loss: 0.0458 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:21:21,838] [INFO] [logging.py:107:log_dist] [Rank 0] step=1738, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1738 loss: 0.1131 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:21:32,512] [INFO] [logging.py:107:log_dist] [Rank 0] step=1739, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1739 loss: 0.1273 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:21:43,183] [INFO] [logging.py:107:log_dist] [Rank 0] step=1740, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1740 loss: 0.1376 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:21:54,068] [INFO] [logging.py:107:log_dist] [Rank 0] step=1741, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1741 loss: 0.0938 iter time (s): 10.855 samples/sec: 0.092 +[2025-05-06 00:22:04,761] [INFO] [logging.py:107:log_dist] [Rank 0] step=1742, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1742 loss: 0.0300 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:22:15,433] [INFO] [logging.py:107:log_dist] [Rank 0] step=1743, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1743 loss: 0.0633 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:22:26,105] [INFO] [logging.py:107:log_dist] [Rank 0] step=1744, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1744 loss: 0.0459 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:22:36,781] [INFO] [logging.py:107:log_dist] [Rank 0] step=1745, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1745 loss: 0.0795 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:22:47,451] [INFO] [logging.py:107:log_dist] [Rank 0] step=1746, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1746 loss: 0.1338 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:22:58,118] [INFO] [logging.py:107:log_dist] [Rank 0] step=1747, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1747 loss: 0.0395 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 00:23:08,792] [INFO] [logging.py:107:log_dist] [Rank 0] step=1748, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1748 loss: 0.0606 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:23:19,467] [INFO] [logging.py:107:log_dist] [Rank 0] step=1749, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1749 loss: 0.0840 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:23:30,325] [INFO] [logging.py:107:log_dist] [Rank 0] step=1750, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1750 loss: 0.1364 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 00:23:40,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=1751, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1751 loss: 0.3187 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:23:51,670] [INFO] [logging.py:107:log_dist] [Rank 0] step=1752, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1752 loss: 0.0319 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:24:02,341] [INFO] [logging.py:107:log_dist] [Rank 0] step=1753, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1753 loss: 0.0332 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:24:13,019] [INFO] [logging.py:107:log_dist] [Rank 0] step=1754, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1754 loss: 0.0541 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:24:23,691] [INFO] [logging.py:107:log_dist] [Rank 0] step=1755, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1755 loss: 0.0641 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:24:34,359] [INFO] [logging.py:107:log_dist] [Rank 0] step=1756, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1756 loss: 0.0992 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 00:24:45,031] [INFO] [logging.py:107:log_dist] [Rank 0] step=1757, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1757 loss: 0.0786 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:24:55,707] [INFO] [logging.py:107:log_dist] [Rank 0] step=1758, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1758 loss: 0.1103 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:25:06,571] [INFO] [logging.py:107:log_dist] [Rank 0] step=1759, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1759 loss: 0.1366 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 00:25:17,239] [INFO] [logging.py:107:log_dist] [Rank 0] step=1760, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1760 loss: 0.1926 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:25:27,913] [INFO] [logging.py:107:log_dist] [Rank 0] step=1761, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1761 loss: 0.0412 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:25:38,581] [INFO] [logging.py:107:log_dist] [Rank 0] step=1762, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1762 loss: 0.0528 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:25:49,251] [INFO] [logging.py:107:log_dist] [Rank 0] step=1763, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1763 loss: 0.0564 iter time (s): 10.644 samples/sec: 0.094 +Started new epoch: 44 +[2025-05-06 00:26:00,256] [INFO] [logging.py:107:log_dist] [Rank 0] step=1764, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1764 loss: 0.0474 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:26:10,930] [INFO] [logging.py:107:log_dist] [Rank 0] step=1765, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1765 loss: 0.0326 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:26:21,603] [INFO] [logging.py:107:log_dist] [Rank 0] step=1766, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1766 loss: 0.0471 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:26:32,434] [INFO] [logging.py:107:log_dist] [Rank 0] step=1767, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1767 loss: 0.0302 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 00:26:43,104] [INFO] [logging.py:107:log_dist] [Rank 0] step=1768, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1768 loss: 0.1647 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:26:53,775] [INFO] [logging.py:107:log_dist] [Rank 0] step=1769, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1769 loss: 0.0301 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:27:04,445] [INFO] [logging.py:107:log_dist] [Rank 0] step=1770, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1770 loss: 0.1362 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:27:15,120] [INFO] [logging.py:107:log_dist] [Rank 0] step=1771, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1771 loss: 0.1457 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:27:25,808] [INFO] [logging.py:107:log_dist] [Rank 0] step=1772, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1772 loss: 0.0492 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:27:36,479] [INFO] [logging.py:107:log_dist] [Rank 0] step=1773, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1773 loss: 0.0637 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:27:47,147] [INFO] [logging.py:107:log_dist] [Rank 0] step=1774, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1774 loss: 0.0696 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:27:57,826] [INFO] [logging.py:107:log_dist] [Rank 0] step=1775, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1775 loss: 0.1506 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:28:08,683] [INFO] [logging.py:107:log_dist] [Rank 0] step=1776, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1776 loss: 0.0631 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-06 00:28:19,353] [INFO] [logging.py:107:log_dist] [Rank 0] step=1777, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1777 loss: 0.0751 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:28:30,031] [INFO] [logging.py:107:log_dist] [Rank 0] step=1778, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1778 loss: 0.0268 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:28:40,701] [INFO] [logging.py:107:log_dist] [Rank 0] step=1779, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1779 loss: 0.1667 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:28:51,372] [INFO] [logging.py:107:log_dist] [Rank 0] step=1780, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1780 loss: 0.1062 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:29:02,043] [INFO] [logging.py:107:log_dist] [Rank 0] step=1781, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1781 loss: 0.0381 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:29:12,715] [INFO] [logging.py:107:log_dist] [Rank 0] step=1782, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1782 loss: 0.0268 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:29:23,385] [INFO] [logging.py:107:log_dist] [Rank 0] step=1783, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1783 loss: 0.2991 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:29:34,220] [INFO] [logging.py:107:log_dist] [Rank 0] step=1784, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1784 loss: 0.1003 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 00:29:44,890] [INFO] [logging.py:107:log_dist] [Rank 0] step=1785, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1785 loss: 0.0616 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:29:55,559] [INFO] [logging.py:107:log_dist] [Rank 0] step=1786, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1786 loss: 0.0440 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:30:06,238] [INFO] [logging.py:107:log_dist] [Rank 0] step=1787, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1787 loss: 0.0557 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 00:30:16,907] [INFO] [logging.py:107:log_dist] [Rank 0] step=1788, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1788 loss: 0.2811 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:30:27,583] [INFO] [logging.py:107:log_dist] [Rank 0] step=1789, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1789 loss: 0.2404 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:30:38,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=1790, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1790 loss: 0.1184 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:30:48,937] [INFO] [logging.py:107:log_dist] [Rank 0] step=1791, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1791 loss: 0.0726 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:30:59,618] [INFO] [logging.py:107:log_dist] [Rank 0] step=1792, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1792 loss: 0.0387 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 00:31:10,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=1793, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1793 loss: 0.0547 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-06 00:31:21,129] [INFO] [logging.py:107:log_dist] [Rank 0] step=1794, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1794 loss: 0.0892 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:31:31,803] [INFO] [logging.py:107:log_dist] [Rank 0] step=1795, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1795 loss: 0.0627 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:31:42,476] [INFO] [logging.py:107:log_dist] [Rank 0] step=1796, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1796 loss: 0.1483 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:31:53,145] [INFO] [logging.py:107:log_dist] [Rank 0] step=1797, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1797 loss: 0.0384 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:32:03,819] [INFO] [logging.py:107:log_dist] [Rank 0] step=1798, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1798 loss: 0.0239 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:32:14,494] [INFO] [logging.py:107:log_dist] [Rank 0] step=1799, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1799 loss: 0.0342 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:32:25,163] [INFO] [logging.py:107:log_dist] [Rank 0] step=1800, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1800 loss: 0.0407 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:32:35,835] [INFO] [logging.py:107:log_dist] [Rank 0] step=1801, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1801 loss: 0.0281 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:32:46,682] [INFO] [logging.py:107:log_dist] [Rank 0] step=1802, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1802 loss: 0.0606 iter time (s): 10.817 samples/sec: 0.092 +[2025-05-06 00:32:57,353] [INFO] [logging.py:107:log_dist] [Rank 0] step=1803, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1803 loss: 0.0644 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:33:08,018] [INFO] [logging.py:107:log_dist] [Rank 0] step=1804, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1804 loss: 0.0452 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 45 +[2025-05-06 00:33:19,021] [INFO] [logging.py:107:log_dist] [Rank 0] step=1805, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1805 loss: 0.0398 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:33:29,691] [INFO] [logging.py:107:log_dist] [Rank 0] step=1806, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1806 loss: 0.1775 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:33:40,373] [INFO] [logging.py:107:log_dist] [Rank 0] step=1807, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1807 loss: 0.1126 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 00:33:51,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=1808, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1808 loss: 0.0928 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:34:01,711] [INFO] [logging.py:107:log_dist] [Rank 0] step=1809, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1809 loss: 0.4717 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:34:12,576] [INFO] [logging.py:107:log_dist] [Rank 0] step=1810, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1810 loss: 0.4218 iter time (s): 10.835 samples/sec: 0.092 +[2025-05-06 00:34:23,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=1811, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1811 loss: 0.0568 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:34:33,920] [INFO] [logging.py:107:log_dist] [Rank 0] step=1812, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1812 loss: 0.1112 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:34:44,588] [INFO] [logging.py:107:log_dist] [Rank 0] step=1813, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1813 loss: 0.0599 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:34:55,259] [INFO] [logging.py:107:log_dist] [Rank 0] step=1814, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1814 loss: 0.1754 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:35:05,932] [INFO] [logging.py:107:log_dist] [Rank 0] step=1815, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1815 loss: 0.0412 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:35:16,604] [INFO] [logging.py:107:log_dist] [Rank 0] step=1816, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1816 loss: 0.0278 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:35:27,275] [INFO] [logging.py:107:log_dist] [Rank 0] step=1817, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1817 loss: 0.1154 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:35:37,949] [INFO] [logging.py:107:log_dist] [Rank 0] step=1818, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1818 loss: 0.0441 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:35:48,813] [INFO] [logging.py:107:log_dist] [Rank 0] step=1819, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1819 loss: 0.0350 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 00:35:59,484] [INFO] [logging.py:107:log_dist] [Rank 0] step=1820, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1820 loss: 0.1284 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:36:10,159] [INFO] [logging.py:107:log_dist] [Rank 0] step=1821, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1821 loss: 0.0388 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:36:20,833] [INFO] [logging.py:107:log_dist] [Rank 0] step=1822, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1822 loss: 0.0617 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:36:31,506] [INFO] [logging.py:107:log_dist] [Rank 0] step=1823, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1823 loss: 0.0269 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:36:42,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=1824, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1824 loss: 0.1256 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:36:52,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=1825, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1825 loss: 0.0394 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:37:03,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=1826, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1826 loss: 0.0389 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:37:14,197] [INFO] [logging.py:107:log_dist] [Rank 0] step=1827, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1827 loss: 0.1274 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:37:25,036] [INFO] [logging.py:107:log_dist] [Rank 0] step=1828, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1828 loss: 0.0823 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-06 00:37:35,707] [INFO] [logging.py:107:log_dist] [Rank 0] step=1829, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1829 loss: 0.1431 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:37:46,379] [INFO] [logging.py:107:log_dist] [Rank 0] step=1830, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1830 loss: 0.0606 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:37:57,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=1831, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1831 loss: 0.0559 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:38:07,718] [INFO] [logging.py:107:log_dist] [Rank 0] step=1832, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1832 loss: 0.0397 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:38:18,387] [INFO] [logging.py:107:log_dist] [Rank 0] step=1833, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1833 loss: 0.2102 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:38:29,057] [INFO] [logging.py:107:log_dist] [Rank 0] step=1834, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1834 loss: 0.2450 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:38:39,727] [INFO] [logging.py:107:log_dist] [Rank 0] step=1835, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1835 loss: 0.0433 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:38:50,586] [INFO] [logging.py:107:log_dist] [Rank 0] step=1836, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1836 loss: 0.0919 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 00:39:01,255] [INFO] [logging.py:107:log_dist] [Rank 0] step=1837, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1837 loss: 0.1014 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:39:11,924] [INFO] [logging.py:107:log_dist] [Rank 0] step=1838, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1838 loss: 0.0557 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:39:22,597] [INFO] [logging.py:107:log_dist] [Rank 0] step=1839, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1839 loss: 0.0435 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:39:33,273] [INFO] [logging.py:107:log_dist] [Rank 0] step=1840, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1840 loss: 0.0329 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:39:43,943] [INFO] [logging.py:107:log_dist] [Rank 0] step=1841, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1841 loss: 0.0643 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:39:54,613] [INFO] [logging.py:107:log_dist] [Rank 0] step=1842, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1842 loss: 0.2158 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:40:05,297] [INFO] [logging.py:107:log_dist] [Rank 0] step=1843, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1843 loss: 0.0996 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:40:15,966] [INFO] [logging.py:107:log_dist] [Rank 0] step=1844, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1844 loss: 0.0303 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:40:26,819] [INFO] [logging.py:107:log_dist] [Rank 0] step=1845, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1845 loss: 0.0969 iter time (s): 10.827 samples/sec: 0.092 +Started new epoch: 46 +[2025-05-06 00:40:37,834] [INFO] [logging.py:107:log_dist] [Rank 0] step=1846, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1846 loss: 0.0327 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:40:48,507] [INFO] [logging.py:107:log_dist] [Rank 0] step=1847, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1847 loss: 0.0502 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:40:59,183] [INFO] [logging.py:107:log_dist] [Rank 0] step=1848, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1848 loss: 0.3127 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:41:09,858] [INFO] [logging.py:107:log_dist] [Rank 0] step=1849, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1849 loss: 0.0344 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:41:20,528] [INFO] [logging.py:107:log_dist] [Rank 0] step=1850, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1850 loss: 0.0975 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:41:31,210] [INFO] [logging.py:107:log_dist] [Rank 0] step=1851, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1851 loss: 0.0467 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:41:41,882] [INFO] [logging.py:107:log_dist] [Rank 0] step=1852, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1852 loss: 0.0847 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:41:52,553] [INFO] [logging.py:107:log_dist] [Rank 0] step=1853, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1853 loss: 0.0340 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:42:03,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=1854, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1854 loss: 0.0370 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 00:42:14,056] [INFO] [logging.py:107:log_dist] [Rank 0] step=1855, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1855 loss: 0.0341 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:42:24,734] [INFO] [logging.py:107:log_dist] [Rank 0] step=1856, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1856 loss: 0.4010 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:42:35,405] [INFO] [logging.py:107:log_dist] [Rank 0] step=1857, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1857 loss: 0.1817 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:42:46,075] [INFO] [logging.py:107:log_dist] [Rank 0] step=1858, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1858 loss: 0.0480 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:42:56,743] [INFO] [logging.py:107:log_dist] [Rank 0] step=1859, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1859 loss: 0.1818 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:43:07,415] [INFO] [logging.py:107:log_dist] [Rank 0] step=1860, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1860 loss: 0.0314 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:43:18,089] [INFO] [logging.py:107:log_dist] [Rank 0] step=1861, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1861 loss: 0.1167 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:43:28,927] [INFO] [logging.py:107:log_dist] [Rank 0] step=1862, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1862 loss: 0.4088 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-06 00:43:39,603] [INFO] [logging.py:107:log_dist] [Rank 0] step=1863, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1863 loss: 0.0972 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:43:50,272] [INFO] [logging.py:107:log_dist] [Rank 0] step=1864, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1864 loss: 0.0733 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:44:00,942] [INFO] [logging.py:107:log_dist] [Rank 0] step=1865, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1865 loss: 0.0419 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:44:11,616] [INFO] [logging.py:107:log_dist] [Rank 0] step=1866, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1866 loss: 0.0248 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:44:22,285] [INFO] [logging.py:107:log_dist] [Rank 0] step=1867, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1867 loss: 0.1004 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:44:32,954] [INFO] [logging.py:107:log_dist] [Rank 0] step=1868, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1868 loss: 0.0623 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:44:43,627] [INFO] [logging.py:107:log_dist] [Rank 0] step=1869, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1869 loss: 0.0305 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:44:54,297] [INFO] [logging.py:107:log_dist] [Rank 0] step=1870, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1870 loss: 0.0812 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:45:05,133] [INFO] [logging.py:107:log_dist] [Rank 0] step=1871, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1871 loss: 0.0429 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 00:45:15,812] [INFO] [logging.py:107:log_dist] [Rank 0] step=1872, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1872 loss: 0.0748 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 00:45:26,484] [INFO] [logging.py:107:log_dist] [Rank 0] step=1873, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1873 loss: 0.2045 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:45:37,161] [INFO] [logging.py:107:log_dist] [Rank 0] step=1874, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1874 loss: 0.0440 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:45:47,835] [INFO] [logging.py:107:log_dist] [Rank 0] step=1875, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1875 loss: 0.0321 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:45:58,506] [INFO] [logging.py:107:log_dist] [Rank 0] step=1876, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1876 loss: 0.0433 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:46:09,177] [INFO] [logging.py:107:log_dist] [Rank 0] step=1877, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1877 loss: 0.0899 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:46:19,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=1878, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1878 loss: 0.2194 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:46:30,712] [INFO] [logging.py:107:log_dist] [Rank 0] step=1879, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1879 loss: 0.1436 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 00:46:41,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=1880, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1880 loss: 0.0339 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:46:52,058] [INFO] [logging.py:107:log_dist] [Rank 0] step=1881, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1881 loss: 0.1294 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:47:02,725] [INFO] [logging.py:107:log_dist] [Rank 0] step=1882, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1882 loss: 0.3868 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 00:47:13,398] [INFO] [logging.py:107:log_dist] [Rank 0] step=1883, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1883 loss: 0.0481 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:47:24,071] [INFO] [logging.py:107:log_dist] [Rank 0] step=1884, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1884 loss: 0.0361 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:47:34,741] [INFO] [logging.py:107:log_dist] [Rank 0] step=1885, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1885 loss: 0.2250 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:47:45,405] [INFO] [logging.py:107:log_dist] [Rank 0] step=1886, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1886 loss: 0.0594 iter time (s): 10.637 samples/sec: 0.094 +Started new epoch: 47 +[2025-05-06 00:47:56,423] [INFO] [logging.py:107:log_dist] [Rank 0] step=1887, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1887 loss: 0.1826 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:48:07,289] [INFO] [logging.py:107:log_dist] [Rank 0] step=1888, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1888 loss: 0.0864 iter time (s): 10.836 samples/sec: 0.092 +[2025-05-06 00:48:17,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=1889, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1889 loss: 0.0484 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:48:28,636] [INFO] [logging.py:107:log_dist] [Rank 0] step=1890, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1890 loss: 0.0494 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:48:39,310] [INFO] [logging.py:107:log_dist] [Rank 0] step=1891, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1891 loss: 0.0313 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:48:49,979] [INFO] [logging.py:107:log_dist] [Rank 0] step=1892, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1892 loss: 0.2024 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:49:00,656] [INFO] [logging.py:107:log_dist] [Rank 0] step=1893, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1893 loss: 0.0946 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:49:11,328] [INFO] [logging.py:107:log_dist] [Rank 0] step=1894, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1894 loss: 0.0298 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:49:22,005] [INFO] [logging.py:107:log_dist] [Rank 0] step=1895, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1895 loss: 0.0334 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 00:49:32,679] [INFO] [logging.py:107:log_dist] [Rank 0] step=1896, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1896 loss: 0.0476 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:49:43,508] [INFO] [logging.py:107:log_dist] [Rank 0] step=1897, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1897 loss: 0.0697 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 00:49:54,182] [INFO] [logging.py:107:log_dist] [Rank 0] step=1898, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1898 loss: 0.0544 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:50:04,857] [INFO] [logging.py:107:log_dist] [Rank 0] step=1899, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1899 loss: 0.0613 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:50:15,526] [INFO] [logging.py:107:log_dist] [Rank 0] step=1900, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1900 loss: 0.1010 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:50:26,200] [INFO] [logging.py:107:log_dist] [Rank 0] step=1901, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1901 loss: 0.0340 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:50:36,869] [INFO] [logging.py:107:log_dist] [Rank 0] step=1902, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1902 loss: 0.0569 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:50:47,542] [INFO] [logging.py:107:log_dist] [Rank 0] step=1903, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1903 loss: 0.0793 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:50:58,221] [INFO] [logging.py:107:log_dist] [Rank 0] step=1904, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1904 loss: 0.0496 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:51:09,083] [INFO] [logging.py:107:log_dist] [Rank 0] step=1905, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1905 loss: 0.1542 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 00:51:19,754] [INFO] [logging.py:107:log_dist] [Rank 0] step=1906, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1906 loss: 0.0541 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:51:30,426] [INFO] [logging.py:107:log_dist] [Rank 0] step=1907, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1907 loss: 0.0730 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:51:41,100] [INFO] [logging.py:107:log_dist] [Rank 0] step=1908, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1908 loss: 0.3214 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:51:51,769] [INFO] [logging.py:107:log_dist] [Rank 0] step=1909, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1909 loss: 0.0689 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:52:02,438] [INFO] [logging.py:107:log_dist] [Rank 0] step=1910, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1910 loss: 0.0345 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:52:13,111] [INFO] [logging.py:107:log_dist] [Rank 0] step=1911, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1911 loss: 0.1147 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:52:23,781] [INFO] [logging.py:107:log_dist] [Rank 0] step=1912, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1912 loss: 0.0774 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:52:34,455] [INFO] [logging.py:107:log_dist] [Rank 0] step=1913, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1913 loss: 0.0968 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:52:45,317] [INFO] [logging.py:107:log_dist] [Rank 0] step=1914, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1914 loss: 0.0384 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-06 00:52:55,989] [INFO] [logging.py:107:log_dist] [Rank 0] step=1915, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1915 loss: 0.1007 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:53:06,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=1916, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1916 loss: 0.0533 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:53:17,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=1917, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1917 loss: 0.0730 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:53:28,004] [INFO] [logging.py:107:log_dist] [Rank 0] step=1918, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1918 loss: 0.2122 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:53:38,683] [INFO] [logging.py:107:log_dist] [Rank 0] step=1919, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1919 loss: 0.0358 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 00:53:49,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=1920, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1920 loss: 0.0302 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:54:00,027] [INFO] [logging.py:107:log_dist] [Rank 0] step=1921, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1921 loss: 0.2266 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:54:10,910] [INFO] [logging.py:107:log_dist] [Rank 0] step=1922, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1922 loss: 0.0615 iter time (s): 10.853 samples/sec: 0.092 +[2025-05-06 00:54:21,578] [INFO] [logging.py:107:log_dist] [Rank 0] step=1923, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1923 loss: 0.0521 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:54:32,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=1924, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1924 loss: 0.0684 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:54:42,922] [INFO] [logging.py:107:log_dist] [Rank 0] step=1925, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1925 loss: 0.0902 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 00:54:53,588] [INFO] [logging.py:107:log_dist] [Rank 0] step=1926, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1926 loss: 0.1653 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 00:55:04,254] [INFO] [logging.py:107:log_dist] [Rank 0] step=1927, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1927 loss: 0.0810 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 48 +[2025-05-06 00:55:15,267] [INFO] [logging.py:107:log_dist] [Rank 0] step=1928, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1928 loss: 0.3904 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:55:25,943] [INFO] [logging.py:107:log_dist] [Rank 0] step=1929, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1929 loss: 0.0726 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:55:36,612] [INFO] [logging.py:107:log_dist] [Rank 0] step=1930, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1930 loss: 0.0420 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 00:55:47,463] [INFO] [logging.py:107:log_dist] [Rank 0] step=1931, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1931 loss: 0.0447 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-06 00:55:58,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=1932, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1932 loss: 0.0545 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:56:08,811] [INFO] [logging.py:107:log_dist] [Rank 0] step=1933, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1933 loss: 0.0510 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:56:19,482] [INFO] [logging.py:107:log_dist] [Rank 0] step=1934, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1934 loss: 0.0737 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:56:30,152] [INFO] [logging.py:107:log_dist] [Rank 0] step=1935, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1935 loss: 0.0622 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:56:40,822] [INFO] [logging.py:107:log_dist] [Rank 0] step=1936, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1936 loss: 0.0783 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:56:51,496] [INFO] [logging.py:107:log_dist] [Rank 0] step=1937, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1937 loss: 0.0397 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:57:02,172] [INFO] [logging.py:107:log_dist] [Rank 0] step=1938, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1938 loss: 0.0428 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 00:57:12,844] [INFO] [logging.py:107:log_dist] [Rank 0] step=1939, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1939 loss: 0.1549 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:57:23,681] [INFO] [logging.py:107:log_dist] [Rank 0] step=1940, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1940 loss: 0.1370 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-06 00:57:34,352] [INFO] [logging.py:107:log_dist] [Rank 0] step=1941, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1941 loss: 0.0373 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:57:45,022] [INFO] [logging.py:107:log_dist] [Rank 0] step=1942, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1942 loss: 0.0397 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:57:55,698] [INFO] [logging.py:107:log_dist] [Rank 0] step=1943, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1943 loss: 0.0347 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 00:58:06,368] [INFO] [logging.py:107:log_dist] [Rank 0] step=1944, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1944 loss: 0.0523 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 00:58:17,037] [INFO] [logging.py:107:log_dist] [Rank 0] step=1945, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1945 loss: 0.0510 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:58:27,710] [INFO] [logging.py:107:log_dist] [Rank 0] step=1946, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1946 loss: 0.0546 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:58:38,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=1947, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1947 loss: 0.0847 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 00:58:49,055] [INFO] [logging.py:107:log_dist] [Rank 0] step=1948, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1948 loss: 0.1387 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 00:58:59,897] [INFO] [logging.py:107:log_dist] [Rank 0] step=1949, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1949 loss: 0.1003 iter time (s): 10.811 samples/sec: 0.092 +[2025-05-06 00:59:10,566] [INFO] [logging.py:107:log_dist] [Rank 0] step=1950, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1950 loss: 0.0570 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 00:59:21,239] [INFO] [logging.py:107:log_dist] [Rank 0] step=1951, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1951 loss: 0.1107 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 00:59:31,925] [INFO] [logging.py:107:log_dist] [Rank 0] step=1952, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1952 loss: 0.0923 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-06 00:59:42,598] [INFO] [logging.py:107:log_dist] [Rank 0] step=1953, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1953 loss: 0.0539 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 00:59:53,283] [INFO] [logging.py:107:log_dist] [Rank 0] step=1954, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1954 loss: 0.0357 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 01:00:03,958] [INFO] [logging.py:107:log_dist] [Rank 0] step=1955, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1955 loss: 0.2113 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:00:14,644] [INFO] [logging.py:107:log_dist] [Rank 0] step=1956, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1956 loss: 0.1540 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:00:25,523] [INFO] [logging.py:107:log_dist] [Rank 0] step=1957, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1957 loss: 0.0522 iter time (s): 10.838 samples/sec: 0.092 +[2025-05-06 01:00:36,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=1958, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1958 loss: 0.0319 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:00:46,868] [INFO] [logging.py:107:log_dist] [Rank 0] step=1959, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1959 loss: 0.0395 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:00:57,543] [INFO] [logging.py:107:log_dist] [Rank 0] step=1960, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1960 loss: 0.0725 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:01:08,214] [INFO] [logging.py:107:log_dist] [Rank 0] step=1961, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1961 loss: 0.0292 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:01:18,884] [INFO] [logging.py:107:log_dist] [Rank 0] step=1962, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1962 loss: 0.0333 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:01:29,557] [INFO] [logging.py:107:log_dist] [Rank 0] step=1963, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1963 loss: 0.0442 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:01:40,233] [INFO] [logging.py:107:log_dist] [Rank 0] step=1964, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1964 loss: 0.0653 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:01:50,903] [INFO] [logging.py:107:log_dist] [Rank 0] step=1965, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1965 loss: 0.0331 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:02:01,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=1966, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1966 loss: 0.0656 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 01:02:12,439] [INFO] [logging.py:107:log_dist] [Rank 0] step=1967, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1967 loss: 0.0533 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:02:23,103] [INFO] [logging.py:107:log_dist] [Rank 0] step=1968, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1968 loss: 0.0627 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 49 +[2025-05-06 01:02:34,136] [INFO] [logging.py:107:log_dist] [Rank 0] step=1969, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1969 loss: 0.0421 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 01:02:44,806] [INFO] [logging.py:107:log_dist] [Rank 0] step=1970, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1970 loss: 0.1179 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:02:55,478] [INFO] [logging.py:107:log_dist] [Rank 0] step=1971, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1971 loss: 0.0573 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:03:06,154] [INFO] [logging.py:107:log_dist] [Rank 0] step=1972, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1972 loss: 0.0813 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:03:16,829] [INFO] [logging.py:107:log_dist] [Rank 0] step=1973, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1973 loss: 0.0720 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:03:27,498] [INFO] [logging.py:107:log_dist] [Rank 0] step=1974, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1974 loss: 0.1390 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:03:38,338] [INFO] [logging.py:107:log_dist] [Rank 0] step=1975, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1975 loss: 0.1484 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-06 01:03:49,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=1976, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1976 loss: 0.0293 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:03:59,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=1977, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1977 loss: 0.3854 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:04:10,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=1978, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1978 loss: 0.1328 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:04:21,025] [INFO] [logging.py:107:log_dist] [Rank 0] step=1979, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1979 loss: 0.0496 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:04:31,694] [INFO] [logging.py:107:log_dist] [Rank 0] step=1980, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1980 loss: 0.0449 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:04:42,368] [INFO] [logging.py:107:log_dist] [Rank 0] step=1981, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1981 loss: 0.2301 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:04:53,037] [INFO] [logging.py:107:log_dist] [Rank 0] step=1982, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1982 loss: 0.0443 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:05:03,900] [INFO] [logging.py:107:log_dist] [Rank 0] step=1983, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1983 loss: 0.0685 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 01:05:14,573] [INFO] [logging.py:107:log_dist] [Rank 0] step=1984, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1984 loss: 0.0574 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:05:25,249] [INFO] [logging.py:107:log_dist] [Rank 0] step=1985, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1985 loss: 0.2964 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:05:35,922] [INFO] [logging.py:107:log_dist] [Rank 0] step=1986, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1986 loss: 0.1049 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:05:46,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=1987, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1987 loss: 0.1719 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:05:57,269] [INFO] [logging.py:107:log_dist] [Rank 0] step=1988, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1988 loss: 0.0332 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:06:07,939] [INFO] [logging.py:107:log_dist] [Rank 0] step=1989, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1989 loss: 0.0535 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:06:18,612] [INFO] [logging.py:107:log_dist] [Rank 0] step=1990, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1990 loss: 0.0309 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:06:29,285] [INFO] [logging.py:107:log_dist] [Rank 0] step=1991, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1991 loss: 0.0614 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:06:40,148] [INFO] [logging.py:107:log_dist] [Rank 0] step=1992, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1992 loss: 0.0533 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 01:06:50,821] [INFO] [logging.py:107:log_dist] [Rank 0] step=1993, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1993 loss: 0.1757 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:07:01,493] [INFO] [logging.py:107:log_dist] [Rank 0] step=1994, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1994 loss: 0.0350 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:07:12,164] [INFO] [logging.py:107:log_dist] [Rank 0] step=1995, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1995 loss: 0.0428 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:07:22,841] [INFO] [logging.py:107:log_dist] [Rank 0] step=1996, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1996 loss: 0.0451 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:07:33,511] [INFO] [logging.py:107:log_dist] [Rank 0] step=1997, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1997 loss: 0.0526 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:07:44,182] [INFO] [logging.py:107:log_dist] [Rank 0] step=1998, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1998 loss: 0.0499 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:07:54,855] [INFO] [logging.py:107:log_dist] [Rank 0] step=1999, skipped=0, lr=[1e-05], mom=[0.0] +steps: 1999 loss: 0.1207 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:08:05,524] [INFO] [logging.py:107:log_dist] [Rank 0] step=2000, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2000 loss: 0.0524 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:08:16,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=2001, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2001 loss: 0.1212 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 01:08:27,030] [INFO] [logging.py:107:log_dist] [Rank 0] step=2002, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2002 loss: 0.0551 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:08:37,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=2003, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2003 loss: 0.1043 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:08:48,371] [INFO] [logging.py:107:log_dist] [Rank 0] step=2004, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2004 loss: 0.0475 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:08:59,043] [INFO] [logging.py:107:log_dist] [Rank 0] step=2005, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2005 loss: 0.0712 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:09:09,710] [INFO] [logging.py:107:log_dist] [Rank 0] step=2006, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2006 loss: 0.0271 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:09:20,377] [INFO] [logging.py:107:log_dist] [Rank 0] step=2007, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2007 loss: 0.0806 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:09:31,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=2008, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2008 loss: 0.0503 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:09:41,882] [INFO] [logging.py:107:log_dist] [Rank 0] step=2009, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2009 loss: 0.0699 iter time (s): 10.804 samples/sec: 0.093 +Started new epoch: 50 +[2025-05-06 01:09:52,903] [INFO] [logging.py:107:log_dist] [Rank 0] step=2010, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2010 loss: 0.0465 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-06 01:10:03,578] [INFO] [logging.py:107:log_dist] [Rank 0] step=2011, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2011 loss: 0.0785 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:10:14,260] [INFO] [logging.py:107:log_dist] [Rank 0] step=2012, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2012 loss: 0.1129 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:10:24,942] [INFO] [logging.py:107:log_dist] [Rank 0] step=2013, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2013 loss: 0.0636 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 01:10:35,616] [INFO] [logging.py:107:log_dist] [Rank 0] step=2014, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2014 loss: 0.2251 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:10:46,287] [INFO] [logging.py:107:log_dist] [Rank 0] step=2015, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2015 loss: 0.1095 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:10:56,963] [INFO] [logging.py:107:log_dist] [Rank 0] step=2016, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2016 loss: 0.1049 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:11:07,634] [INFO] [logging.py:107:log_dist] [Rank 0] step=2017, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2017 loss: 0.0609 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:11:18,475] [INFO] [logging.py:107:log_dist] [Rank 0] step=2018, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2018 loss: 0.2156 iter time (s): 10.809 samples/sec: 0.093 +[2025-05-06 01:11:29,153] [INFO] [logging.py:107:log_dist] [Rank 0] step=2019, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2019 loss: 0.2296 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 01:11:39,827] [INFO] [logging.py:107:log_dist] [Rank 0] step=2020, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2020 loss: 0.0902 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:11:39,829] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step2020 is about to be saved! +[2025-05-06 01:11:39,831] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_00-model_states.pt... +[2025-05-06 01:11:39,831] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_00-model_states.pt. +[2025-05-06 01:11:39,837] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_01-model_states.pt... +[2025-05-06 01:11:39,844] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_01-model_states.pt. +[2025-05-06 01:11:39,849] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_02-model_states.pt... +[2025-05-06 01:11:39,856] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_02-model_states.pt. +[2025-05-06 01:11:39,860] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_03-model_states.pt... +[2025-05-06 01:11:39,866] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_03-model_states.pt. +[2025-05-06 01:11:39,870] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_04-model_states.pt... +[2025-05-06 01:11:39,876] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_04-model_states.pt. +[2025-05-06 01:11:39,880] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_05-model_states.pt... +[2025-05-06 01:11:39,887] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_05-model_states.pt. +[2025-05-06 01:11:39,890] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_06-model_states.pt... +[2025-05-06 01:11:39,897] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_06-model_states.pt. +[2025-05-06 01:11:39,900] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_07-model_states.pt... +[2025-05-06 01:11:39,907] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_07-model_states.pt. +[2025-05-06 01:11:39,910] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_08-model_states.pt... +[2025-05-06 01:11:39,917] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_08-model_states.pt. +[2025-05-06 01:11:39,920] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_09-model_states.pt... +[2025-05-06 01:11:39,927] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_09-model_states.pt. +[2025-05-06 01:11:39,930] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_10-model_states.pt... +[2025-05-06 01:11:39,936] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_10-model_states.pt. +[2025-05-06 01:11:39,940] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_11-model_states.pt... +[2025-05-06 01:11:39,946] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_11-model_states.pt. +[2025-05-06 01:11:39,950] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_12-model_states.pt... +[2025-05-06 01:11:39,956] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_12-model_states.pt. +[2025-05-06 01:11:39,959] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_13-model_states.pt... +[2025-05-06 01:11:39,966] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_13-model_states.pt. +[2025-05-06 01:11:39,970] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_14-model_states.pt... +[2025-05-06 01:11:39,976] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_14-model_states.pt. +[2025-05-06 01:11:39,980] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_15-model_states.pt... +[2025-05-06 01:11:39,986] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_15-model_states.pt. +[2025-05-06 01:11:39,990] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_16-model_states.pt... +[2025-05-06 01:11:39,997] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_16-model_states.pt. +[2025-05-06 01:11:40,000] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_17-model_states.pt... +[2025-05-06 01:11:40,007] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_17-model_states.pt. +[2025-05-06 01:11:40,010] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_18-model_states.pt... +[2025-05-06 01:11:40,016] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_18-model_states.pt. +[2025-05-06 01:11:40,020] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_19-model_states.pt... +[2025-05-06 01:11:40,026] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_19-model_states.pt. +[2025-05-06 01:11:40,030] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_20-model_states.pt... +[2025-05-06 01:11:40,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_20-model_states.pt. +[2025-05-06 01:11:40,040] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_21-model_states.pt... +[2025-05-06 01:11:40,046] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_21-model_states.pt. +[2025-05-06 01:11:40,050] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_22-model_states.pt... +[2025-05-06 01:11:40,057] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_22-model_states.pt. +[2025-05-06 01:11:40,060] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_23-model_states.pt... +[2025-05-06 01:11:40,067] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_23-model_states.pt. +[2025-05-06 01:11:40,070] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_24-model_states.pt... +[2025-05-06 01:11:40,077] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_24-model_states.pt. +[2025-05-06 01:11:40,080] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_25-model_states.pt... +[2025-05-06 01:11:40,086] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_25-model_states.pt. +[2025-05-06 01:11:40,090] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_26-model_states.pt... +[2025-05-06 01:11:40,096] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_26-model_states.pt. +[2025-05-06 01:11:40,100] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_27-model_states.pt... +[2025-05-06 01:11:40,106] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_27-model_states.pt. +[2025-05-06 01:11:40,109] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_28-model_states.pt... +[2025-05-06 01:11:40,116] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_28-model_states.pt. +[2025-05-06 01:11:40,119] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_29-model_states.pt... +[2025-05-06 01:11:40,126] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_29-model_states.pt. +[2025-05-06 01:11:40,129] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_30-model_states.pt... +[2025-05-06 01:11:40,136] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_30-model_states.pt. +[2025-05-06 01:11:40,139] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_31-model_states.pt... +[2025-05-06 01:11:40,146] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_31-model_states.pt. +[2025-05-06 01:11:40,149] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_32-model_states.pt... +[2025-05-06 01:11:40,156] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_32-model_states.pt. +[2025-05-06 01:11:40,159] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_33-model_states.pt... +[2025-05-06 01:11:40,166] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_33-model_states.pt. +[2025-05-06 01:11:40,169] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_34-model_states.pt... +[2025-05-06 01:11:40,175] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_34-model_states.pt. +[2025-05-06 01:11:40,179] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_35-model_states.pt... +[2025-05-06 01:11:40,185] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_35-model_states.pt. +[2025-05-06 01:11:40,188] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_36-model_states.pt... +[2025-05-06 01:11:40,195] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_36-model_states.pt. +[2025-05-06 01:11:40,198] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_37-model_states.pt... +[2025-05-06 01:11:40,205] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_37-model_states.pt. +[2025-05-06 01:11:40,208] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_38-model_states.pt... +[2025-05-06 01:11:40,214] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_38-model_states.pt. +[2025-05-06 01:11:40,218] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_39-model_states.pt... +[2025-05-06 01:11:40,224] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_39-model_states.pt. +[2025-05-06 01:11:40,228] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_40-model_states.pt... +[2025-05-06 01:11:40,234] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_40-model_states.pt. +[2025-05-06 01:11:40,234] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_41-model_states.pt... +[2025-05-06 01:11:40,235] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/layer_41-model_states.pt. +[2025-05-06 01:11:40,253] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/mp_rank_00_model_states.pt +[2025-05-06 01:11:40,253] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/mp_rank_00_model_states.pt... +[2025-05-06 01:11:41,257] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2020/mp_rank_00_model_states.pt. +[2025-05-06 01:11:41,258] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2020 is ready now! +[2025-05-06 01:11:51,927] [INFO] [logging.py:107:log_dist] [Rank 0] step=2021, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2021 loss: 0.2718 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:12:02,608] [INFO] [logging.py:107:log_dist] [Rank 0] step=2022, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2022 loss: 0.0427 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 01:12:13,282] [INFO] [logging.py:107:log_dist] [Rank 0] step=2023, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2023 loss: 0.0380 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:12:23,955] [INFO] [logging.py:107:log_dist] [Rank 0] step=2024, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2024 loss: 0.0607 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:12:34,627] [INFO] [logging.py:107:log_dist] [Rank 0] step=2025, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2025 loss: 0.1385 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:12:45,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=2026, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2026 loss: 0.0351 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 01:12:56,125] [INFO] [logging.py:107:log_dist] [Rank 0] step=2027, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2027 loss: 0.0503 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:13:06,797] [INFO] [logging.py:107:log_dist] [Rank 0] step=2028, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2028 loss: 0.0573 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:13:17,468] [INFO] [logging.py:107:log_dist] [Rank 0] step=2029, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2029 loss: 0.0342 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:13:28,138] [INFO] [logging.py:107:log_dist] [Rank 0] step=2030, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2030 loss: 0.0263 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:13:38,812] [INFO] [logging.py:107:log_dist] [Rank 0] step=2031, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2031 loss: 0.2446 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:13:49,482] [INFO] [logging.py:107:log_dist] [Rank 0] step=2032, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2032 loss: 0.1563 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:14:00,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=2033, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2033 loss: 0.0616 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 01:14:10,837] [INFO] [logging.py:107:log_dist] [Rank 0] step=2034, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2034 loss: 0.1079 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:14:21,694] [INFO] [logging.py:107:log_dist] [Rank 0] step=2035, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2035 loss: 0.0419 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-06 01:14:32,363] [INFO] [logging.py:107:log_dist] [Rank 0] step=2036, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2036 loss: 0.0516 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:14:43,040] [INFO] [logging.py:107:log_dist] [Rank 0] step=2037, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2037 loss: 0.1060 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:14:53,713] [INFO] [logging.py:107:log_dist] [Rank 0] step=2038, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2038 loss: 0.0533 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:15:04,388] [INFO] [logging.py:107:log_dist] [Rank 0] step=2039, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2039 loss: 0.2504 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:15:15,061] [INFO] [logging.py:107:log_dist] [Rank 0] step=2040, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2040 loss: 0.2479 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:15:25,736] [INFO] [logging.py:107:log_dist] [Rank 0] step=2041, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2041 loss: 0.1085 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:15:36,406] [INFO] [logging.py:107:log_dist] [Rank 0] step=2042, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2042 loss: 0.0306 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:15:47,242] [INFO] [logging.py:107:log_dist] [Rank 0] step=2043, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2043 loss: 0.1405 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 01:15:57,915] [INFO] [logging.py:107:log_dist] [Rank 0] step=2044, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2044 loss: 0.0236 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:16:08,591] [INFO] [logging.py:107:log_dist] [Rank 0] step=2045, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2045 loss: 0.0335 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:16:19,268] [INFO] [logging.py:107:log_dist] [Rank 0] step=2046, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2046 loss: 0.1121 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 01:16:29,948] [INFO] [logging.py:107:log_dist] [Rank 0] step=2047, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2047 loss: 0.0303 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:16:40,618] [INFO] [logging.py:107:log_dist] [Rank 0] step=2048, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2048 loss: 0.0949 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:16:51,290] [INFO] [logging.py:107:log_dist] [Rank 0] step=2049, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2049 loss: 0.1006 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:17:01,956] [INFO] [logging.py:107:log_dist] [Rank 0] step=2050, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2050 loss: 0.1120 iter time (s): 10.640 samples/sec: 0.094 +Saving model to directory epoch50 +Started new epoch: 51 +[2025-05-06 01:17:14,759] [INFO] [logging.py:107:log_dist] [Rank 0] step=2051, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2051 loss: 0.0332 iter time (s): 10.859 samples/sec: 0.092 +[2025-05-06 01:17:25,429] [INFO] [logging.py:107:log_dist] [Rank 0] step=2052, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2052 loss: 0.0523 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:17:36,099] [INFO] [logging.py:107:log_dist] [Rank 0] step=2053, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2053 loss: 0.0719 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:17:46,773] [INFO] [logging.py:107:log_dist] [Rank 0] step=2054, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2054 loss: 0.0581 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:17:57,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=2055, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2055 loss: 0.1000 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:18:08,111] [INFO] [logging.py:107:log_dist] [Rank 0] step=2056, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2056 loss: 0.0968 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:18:18,696] [INFO] [logging.py:107:log_dist] [Rank 0] step=2057, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2057 loss: 0.0397 iter time (s): 10.554 samples/sec: 0.095 +[2025-05-06 01:18:29,365] [INFO] [logging.py:107:log_dist] [Rank 0] step=2058, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2058 loss: 0.0293 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:18:40,036] [INFO] [logging.py:107:log_dist] [Rank 0] step=2059, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2059 loss: 0.0372 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:18:50,916] [INFO] [logging.py:107:log_dist] [Rank 0] step=2060, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2060 loss: 0.0331 iter time (s): 10.841 samples/sec: 0.092 +[2025-05-06 01:19:01,590] [INFO] [logging.py:107:log_dist] [Rank 0] step=2061, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2061 loss: 0.0941 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:19:12,261] [INFO] [logging.py:107:log_dist] [Rank 0] step=2062, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2062 loss: 0.0361 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:19:22,933] [INFO] [logging.py:107:log_dist] [Rank 0] step=2063, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2063 loss: 0.0485 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:19:33,604] [INFO] [logging.py:107:log_dist] [Rank 0] step=2064, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2064 loss: 0.0466 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:19:44,274] [INFO] [logging.py:107:log_dist] [Rank 0] step=2065, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2065 loss: 0.0259 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:19:54,958] [INFO] [logging.py:107:log_dist] [Rank 0] step=2066, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2066 loss: 0.0335 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 01:20:05,635] [INFO] [logging.py:107:log_dist] [Rank 0] step=2067, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2067 loss: 0.0367 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:20:16,470] [INFO] [logging.py:107:log_dist] [Rank 0] step=2068, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2068 loss: 0.0874 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 01:20:27,146] [INFO] [logging.py:107:log_dist] [Rank 0] step=2069, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2069 loss: 0.0789 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:20:37,818] [INFO] [logging.py:107:log_dist] [Rank 0] step=2070, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2070 loss: 0.0425 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:20:48,490] [INFO] [logging.py:107:log_dist] [Rank 0] step=2071, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2071 loss: 0.0378 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:20:59,160] [INFO] [logging.py:107:log_dist] [Rank 0] step=2072, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2072 loss: 0.0724 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:21:09,830] [INFO] [logging.py:107:log_dist] [Rank 0] step=2073, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2073 loss: 0.3531 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:21:20,504] [INFO] [logging.py:107:log_dist] [Rank 0] step=2074, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2074 loss: 0.0291 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:21:31,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=2075, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2075 loss: 0.0766 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:21:41,852] [INFO] [logging.py:107:log_dist] [Rank 0] step=2076, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2076 loss: 0.3209 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:21:52,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=2077, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2077 loss: 0.0326 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 01:22:03,377] [INFO] [logging.py:107:log_dist] [Rank 0] step=2078, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2078 loss: 0.0594 iter time (s): 10.659 samples/sec: 0.094 +[2025-05-06 01:22:14,077] [INFO] [logging.py:107:log_dist] [Rank 0] step=2079, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2079 loss: 0.0393 iter time (s): 10.668 samples/sec: 0.094 +[2025-05-06 01:22:24,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=2080, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2080 loss: 0.0526 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:22:35,420] [INFO] [logging.py:107:log_dist] [Rank 0] step=2081, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2081 loss: 0.1504 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:22:46,095] [INFO] [logging.py:107:log_dist] [Rank 0] step=2082, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2082 loss: 0.2623 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:22:56,769] [INFO] [logging.py:107:log_dist] [Rank 0] step=2083, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2083 loss: 0.1473 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:23:07,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=2084, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2084 loss: 0.0887 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:23:18,115] [INFO] [logging.py:107:log_dist] [Rank 0] step=2085, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2085 loss: 0.1463 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:23:28,948] [INFO] [logging.py:107:log_dist] [Rank 0] step=2086, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2086 loss: 0.0295 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-06 01:23:39,620] [INFO] [logging.py:107:log_dist] [Rank 0] step=2087, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2087 loss: 0.0594 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:23:50,300] [INFO] [logging.py:107:log_dist] [Rank 0] step=2088, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2088 loss: 0.0697 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:24:00,969] [INFO] [logging.py:107:log_dist] [Rank 0] step=2089, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2089 loss: 0.0602 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:24:11,642] [INFO] [logging.py:107:log_dist] [Rank 0] step=2090, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2090 loss: 0.0562 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:24:22,306] [INFO] [logging.py:107:log_dist] [Rank 0] step=2091, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2091 loss: 0.0497 iter time (s): 10.637 samples/sec: 0.094 +Started new epoch: 52 +[2025-05-06 01:24:33,313] [INFO] [logging.py:107:log_dist] [Rank 0] step=2092, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2092 loss: 0.0495 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:24:43,985] [INFO] [logging.py:107:log_dist] [Rank 0] step=2093, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2093 loss: 0.3474 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:24:54,847] [INFO] [logging.py:107:log_dist] [Rank 0] step=2094, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2094 loss: 0.1288 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 01:25:05,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=2095, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2095 loss: 0.0796 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:25:16,188] [INFO] [logging.py:107:log_dist] [Rank 0] step=2096, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2096 loss: 0.0329 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:25:26,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=2097, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2097 loss: 0.0286 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:25:37,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=2098, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2098 loss: 0.0489 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:25:48,208] [INFO] [logging.py:107:log_dist] [Rank 0] step=2099, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2099 loss: 0.0326 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:25:58,879] [INFO] [logging.py:107:log_dist] [Rank 0] step=2100, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2100 loss: 0.0565 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:26:09,546] [INFO] [logging.py:107:log_dist] [Rank 0] step=2101, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2101 loss: 0.0509 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:26:20,218] [INFO] [logging.py:107:log_dist] [Rank 0] step=2102, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2102 loss: 0.1572 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:26:31,077] [INFO] [logging.py:107:log_dist] [Rank 0] step=2103, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2103 loss: 0.0681 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 01:26:41,762] [INFO] [logging.py:107:log_dist] [Rank 0] step=2104, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2104 loss: 0.0691 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-06 01:26:52,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=2105, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2105 loss: 0.0388 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:27:03,105] [INFO] [logging.py:107:log_dist] [Rank 0] step=2106, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2106 loss: 0.0875 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:27:13,778] [INFO] [logging.py:107:log_dist] [Rank 0] step=2107, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2107 loss: 0.0357 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:27:24,451] [INFO] [logging.py:107:log_dist] [Rank 0] step=2108, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2108 loss: 0.0276 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:27:35,119] [INFO] [logging.py:107:log_dist] [Rank 0] step=2109, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2109 loss: 0.0428 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:27:45,789] [INFO] [logging.py:107:log_dist] [Rank 0] step=2110, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2110 loss: 0.0447 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:27:56,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=2111, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2111 loss: 0.0448 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:28:07,286] [INFO] [logging.py:107:log_dist] [Rank 0] step=2112, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2112 loss: 0.1431 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 01:28:17,959] [INFO] [logging.py:107:log_dist] [Rank 0] step=2113, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2113 loss: 0.0705 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:28:28,631] [INFO] [logging.py:107:log_dist] [Rank 0] step=2114, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2114 loss: 0.0316 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:28:39,307] [INFO] [logging.py:107:log_dist] [Rank 0] step=2115, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2115 loss: 0.0602 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:28:49,985] [INFO] [logging.py:107:log_dist] [Rank 0] step=2116, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2116 loss: 0.0425 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 01:29:00,655] [INFO] [logging.py:107:log_dist] [Rank 0] step=2117, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2117 loss: 0.0457 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:29:11,325] [INFO] [logging.py:107:log_dist] [Rank 0] step=2118, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2118 loss: 0.0332 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:29:21,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=2119, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2119 loss: 0.1620 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:29:32,859] [INFO] [logging.py:107:log_dist] [Rank 0] step=2120, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2120 loss: 0.1020 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 01:29:43,529] [INFO] [logging.py:107:log_dist] [Rank 0] step=2121, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2121 loss: 0.0661 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:29:54,200] [INFO] [logging.py:107:log_dist] [Rank 0] step=2122, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2122 loss: 0.0321 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:30:04,889] [INFO] [logging.py:107:log_dist] [Rank 0] step=2123, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2123 loss: 0.0420 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 01:30:15,558] [INFO] [logging.py:107:log_dist] [Rank 0] step=2124, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2124 loss: 0.0400 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:30:26,232] [INFO] [logging.py:107:log_dist] [Rank 0] step=2125, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2125 loss: 0.2313 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:30:36,902] [INFO] [logging.py:107:log_dist] [Rank 0] step=2126, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2126 loss: 0.1171 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:30:47,571] [INFO] [logging.py:107:log_dist] [Rank 0] step=2127, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2127 loss: 0.0371 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:30:58,245] [INFO] [logging.py:107:log_dist] [Rank 0] step=2128, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2128 loss: 0.0518 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:31:09,101] [INFO] [logging.py:107:log_dist] [Rank 0] step=2129, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2129 loss: 0.0363 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 01:31:19,771] [INFO] [logging.py:107:log_dist] [Rank 0] step=2130, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2130 loss: 0.0519 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:31:30,446] [INFO] [logging.py:107:log_dist] [Rank 0] step=2131, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2131 loss: 0.1496 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:31:41,110] [INFO] [logging.py:107:log_dist] [Rank 0] step=2132, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2132 loss: 0.0589 iter time (s): 10.636 samples/sec: 0.094 +Started new epoch: 53 +[2025-05-06 01:31:52,121] [INFO] [logging.py:107:log_dist] [Rank 0] step=2133, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2133 loss: 0.1685 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:32:02,791] [INFO] [logging.py:107:log_dist] [Rank 0] step=2134, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2134 loss: 0.0628 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:32:13,465] [INFO] [logging.py:107:log_dist] [Rank 0] step=2135, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2135 loss: 0.0617 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:32:24,167] [INFO] [logging.py:107:log_dist] [Rank 0] step=2136, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2136 loss: 0.0408 iter time (s): 10.671 samples/sec: 0.094 +[2025-05-06 01:32:34,839] [INFO] [logging.py:107:log_dist] [Rank 0] step=2137, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2137 loss: 0.0355 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:32:45,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=2138, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2138 loss: 0.0694 iter time (s): 10.793 samples/sec: 0.093 +[2025-05-06 01:32:56,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=2139, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2139 loss: 0.0464 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:33:07,005] [INFO] [logging.py:107:log_dist] [Rank 0] step=2140, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2140 loss: 0.0963 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:33:17,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=2141, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2141 loss: 0.0323 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:33:28,346] [INFO] [logging.py:107:log_dist] [Rank 0] step=2142, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2142 loss: 0.0837 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:33:39,019] [INFO] [logging.py:107:log_dist] [Rank 0] step=2143, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2143 loss: 0.0349 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:33:49,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=2144, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2144 loss: 0.1530 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:34:00,355] [INFO] [logging.py:107:log_dist] [Rank 0] step=2145, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2145 loss: 0.0527 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:34:11,206] [INFO] [logging.py:107:log_dist] [Rank 0] step=2146, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2146 loss: 0.1442 iter time (s): 10.819 samples/sec: 0.092 +[2025-05-06 01:34:21,881] [INFO] [logging.py:107:log_dist] [Rank 0] step=2147, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2147 loss: 0.0542 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:34:32,548] [INFO] [logging.py:107:log_dist] [Rank 0] step=2148, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2148 loss: 0.0606 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:34:43,218] [INFO] [logging.py:107:log_dist] [Rank 0] step=2149, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2149 loss: 0.0666 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:34:53,887] [INFO] [logging.py:107:log_dist] [Rank 0] step=2150, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2150 loss: 0.0342 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:35:04,567] [INFO] [logging.py:107:log_dist] [Rank 0] step=2151, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2151 loss: 0.1191 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 01:35:15,242] [INFO] [logging.py:107:log_dist] [Rank 0] step=2152, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2152 loss: 0.0634 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:35:25,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=2153, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2153 loss: 0.0460 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:35:36,587] [INFO] [logging.py:107:log_dist] [Rank 0] step=2154, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2154 loss: 0.1147 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:35:47,436] [INFO] [logging.py:107:log_dist] [Rank 0] step=2155, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2155 loss: 0.0500 iter time (s): 10.817 samples/sec: 0.092 +[2025-05-06 01:35:58,106] [INFO] [logging.py:107:log_dist] [Rank 0] step=2156, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2156 loss: 0.0339 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:36:08,797] [INFO] [logging.py:107:log_dist] [Rank 0] step=2157, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2157 loss: 0.0407 iter time (s): 10.659 samples/sec: 0.094 +[2025-05-06 01:36:19,468] [INFO] [logging.py:107:log_dist] [Rank 0] step=2158, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2158 loss: 0.0761 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:36:30,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=2159, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2159 loss: 0.0342 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:36:40,807] [INFO] [logging.py:107:log_dist] [Rank 0] step=2160, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2160 loss: 0.0429 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:36:51,481] [INFO] [logging.py:107:log_dist] [Rank 0] step=2161, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2161 loss: 0.1086 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:37:02,150] [INFO] [logging.py:107:log_dist] [Rank 0] step=2162, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2162 loss: 0.0469 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:37:13,018] [INFO] [logging.py:107:log_dist] [Rank 0] step=2163, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2163 loss: 0.0311 iter time (s): 10.837 samples/sec: 0.092 +[2025-05-06 01:37:23,693] [INFO] [logging.py:107:log_dist] [Rank 0] step=2164, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2164 loss: 0.0459 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:37:34,363] [INFO] [logging.py:107:log_dist] [Rank 0] step=2165, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2165 loss: 0.0287 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:37:45,032] [INFO] [logging.py:107:log_dist] [Rank 0] step=2166, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2166 loss: 0.0737 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:37:55,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=2167, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2167 loss: 0.1656 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:38:06,371] [INFO] [logging.py:107:log_dist] [Rank 0] step=2168, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2168 loss: 0.0327 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:38:17,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=2169, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2169 loss: 0.0440 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:38:27,709] [INFO] [logging.py:107:log_dist] [Rank 0] step=2170, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2170 loss: 0.1336 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:38:38,382] [INFO] [logging.py:107:log_dist] [Rank 0] step=2171, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2171 loss: 0.0782 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:38:49,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=2172, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2172 loss: 0.1450 iter time (s): 10.836 samples/sec: 0.092 +[2025-05-06 01:38:59,912] [INFO] [logging.py:107:log_dist] [Rank 0] step=2173, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2173 loss: 0.1152 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 54 +[2025-05-06 01:39:10,923] [INFO] [logging.py:107:log_dist] [Rank 0] step=2174, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2174 loss: 0.0380 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:39:21,610] [INFO] [logging.py:107:log_dist] [Rank 0] step=2175, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2175 loss: 0.0329 iter time (s): 10.657 samples/sec: 0.094 +[2025-05-06 01:39:32,279] [INFO] [logging.py:107:log_dist] [Rank 0] step=2176, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2176 loss: 0.0631 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:39:42,945] [INFO] [logging.py:107:log_dist] [Rank 0] step=2177, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2177 loss: 0.0518 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:39:53,621] [INFO] [logging.py:107:log_dist] [Rank 0] step=2178, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2178 loss: 0.1004 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:40:04,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=2179, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2179 loss: 0.0467 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 01:40:14,972] [INFO] [logging.py:107:log_dist] [Rank 0] step=2180, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2180 loss: 0.1556 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:40:25,803] [INFO] [logging.py:107:log_dist] [Rank 0] step=2181, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2181 loss: 0.0340 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 01:40:36,475] [INFO] [logging.py:107:log_dist] [Rank 0] step=2182, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2182 loss: 0.0408 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:40:47,146] [INFO] [logging.py:107:log_dist] [Rank 0] step=2183, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2183 loss: 0.1336 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:40:57,816] [INFO] [logging.py:107:log_dist] [Rank 0] step=2184, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2184 loss: 0.0376 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:41:08,488] [INFO] [logging.py:107:log_dist] [Rank 0] step=2185, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2185 loss: 0.0406 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:41:19,161] [INFO] [logging.py:107:log_dist] [Rank 0] step=2186, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2186 loss: 0.1231 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:41:29,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=2187, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2187 loss: 0.0761 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:41:40,499] [INFO] [logging.py:107:log_dist] [Rank 0] step=2188, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2188 loss: 0.1173 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:41:51,359] [INFO] [logging.py:107:log_dist] [Rank 0] step=2189, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2189 loss: 0.3461 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 01:42:02,034] [INFO] [logging.py:107:log_dist] [Rank 0] step=2190, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2190 loss: 0.0424 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:42:12,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=2191, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2191 loss: 0.0393 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:42:23,380] [INFO] [logging.py:107:log_dist] [Rank 0] step=2192, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2192 loss: 0.0815 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 01:42:34,059] [INFO] [logging.py:107:log_dist] [Rank 0] step=2193, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2193 loss: 0.0736 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 01:42:44,728] [INFO] [logging.py:107:log_dist] [Rank 0] step=2194, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2194 loss: 0.0352 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:42:55,397] [INFO] [logging.py:107:log_dist] [Rank 0] step=2195, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2195 loss: 0.1722 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:43:06,079] [INFO] [logging.py:107:log_dist] [Rank 0] step=2196, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2196 loss: 0.0566 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 01:43:16,751] [INFO] [logging.py:107:log_dist] [Rank 0] step=2197, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2197 loss: 0.0535 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:43:27,611] [INFO] [logging.py:107:log_dist] [Rank 0] step=2198, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2198 loss: 0.0449 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 01:43:38,285] [INFO] [logging.py:107:log_dist] [Rank 0] step=2199, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2199 loss: 0.0466 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:43:48,956] [INFO] [logging.py:107:log_dist] [Rank 0] step=2200, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2200 loss: 0.0636 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:43:59,626] [INFO] [logging.py:107:log_dist] [Rank 0] step=2201, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2201 loss: 0.0322 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:44:10,304] [INFO] [logging.py:107:log_dist] [Rank 0] step=2202, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2202 loss: 0.0714 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 01:44:20,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=2203, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2203 loss: 0.0292 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 01:44:31,639] [INFO] [logging.py:107:log_dist] [Rank 0] step=2204, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2204 loss: 0.2600 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:44:42,325] [INFO] [logging.py:107:log_dist] [Rank 0] step=2205, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2205 loss: 0.0714 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-06 01:44:52,996] [INFO] [logging.py:107:log_dist] [Rank 0] step=2206, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2206 loss: 0.0819 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:45:03,822] [INFO] [logging.py:107:log_dist] [Rank 0] step=2207, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2207 loss: 0.0445 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-06 01:45:14,496] [INFO] [logging.py:107:log_dist] [Rank 0] step=2208, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2208 loss: 0.1276 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:45:25,165] [INFO] [logging.py:107:log_dist] [Rank 0] step=2209, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2209 loss: 0.0618 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:45:35,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=2210, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2210 loss: 0.0532 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:45:46,502] [INFO] [logging.py:107:log_dist] [Rank 0] step=2211, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2211 loss: 0.0992 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:45:57,173] [INFO] [logging.py:107:log_dist] [Rank 0] step=2212, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2212 loss: 0.0499 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:46:07,874] [INFO] [logging.py:107:log_dist] [Rank 0] step=2213, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2213 loss: 0.0318 iter time (s): 10.671 samples/sec: 0.094 +[2025-05-06 01:46:18,544] [INFO] [logging.py:107:log_dist] [Rank 0] step=2214, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2214 loss: 0.1501 iter time (s): 10.640 samples/sec: 0.094 +Started new epoch: 55 +[2025-05-06 01:46:29,723] [INFO] [logging.py:107:log_dist] [Rank 0] step=2215, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2215 loss: 0.0802 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-06 01:46:40,399] [INFO] [logging.py:107:log_dist] [Rank 0] step=2216, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2216 loss: 0.0449 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:46:51,073] [INFO] [logging.py:107:log_dist] [Rank 0] step=2217, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2217 loss: 0.3360 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:47:01,744] [INFO] [logging.py:107:log_dist] [Rank 0] step=2218, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2218 loss: 0.0601 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:47:12,420] [INFO] [logging.py:107:log_dist] [Rank 0] step=2219, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2219 loss: 0.0291 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 01:47:23,095] [INFO] [logging.py:107:log_dist] [Rank 0] step=2220, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2220 loss: 0.0489 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:47:33,766] [INFO] [logging.py:107:log_dist] [Rank 0] step=2221, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2221 loss: 0.1072 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:47:44,440] [INFO] [logging.py:107:log_dist] [Rank 0] step=2222, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2222 loss: 0.0272 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:47:55,116] [INFO] [logging.py:107:log_dist] [Rank 0] step=2223, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2223 loss: 0.0862 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:48:05,976] [INFO] [logging.py:107:log_dist] [Rank 0] step=2224, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2224 loss: 0.0386 iter time (s): 10.824 samples/sec: 0.092 +[2025-05-06 01:48:16,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=2225, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2225 loss: 0.0377 iter time (s): 10.656 samples/sec: 0.094 +[2025-05-06 01:48:27,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=2226, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2226 loss: 0.0340 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:48:38,005] [INFO] [logging.py:107:log_dist] [Rank 0] step=2227, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2227 loss: 0.0398 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:48:48,682] [INFO] [logging.py:107:log_dist] [Rank 0] step=2228, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2228 loss: 0.1036 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 01:48:59,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=2229, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2229 loss: 0.0420 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:49:10,025] [INFO] [logging.py:107:log_dist] [Rank 0] step=2230, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2230 loss: 0.0517 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:49:20,697] [INFO] [logging.py:107:log_dist] [Rank 0] step=2231, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2231 loss: 0.1635 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:49:31,528] [INFO] [logging.py:107:log_dist] [Rank 0] step=2232, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2232 loss: 0.1799 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 01:49:42,198] [INFO] [logging.py:107:log_dist] [Rank 0] step=2233, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2233 loss: 0.1031 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:49:52,871] [INFO] [logging.py:107:log_dist] [Rank 0] step=2234, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2234 loss: 0.0361 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:50:03,541] [INFO] [logging.py:107:log_dist] [Rank 0] step=2235, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2235 loss: 0.0892 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:50:14,208] [INFO] [logging.py:107:log_dist] [Rank 0] step=2236, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2236 loss: 0.0344 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:50:24,892] [INFO] [logging.py:107:log_dist] [Rank 0] step=2237, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2237 loss: 0.1556 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:50:35,562] [INFO] [logging.py:107:log_dist] [Rank 0] step=2238, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2238 loss: 0.1528 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:50:46,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=2239, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2239 loss: 0.0385 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:50:56,899] [INFO] [logging.py:107:log_dist] [Rank 0] step=2240, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2240 loss: 0.1473 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:51:07,728] [INFO] [logging.py:107:log_dist] [Rank 0] step=2241, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2241 loss: 0.0710 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 01:51:18,398] [INFO] [logging.py:107:log_dist] [Rank 0] step=2242, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2242 loss: 0.0430 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:51:29,070] [INFO] [logging.py:107:log_dist] [Rank 0] step=2243, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2243 loss: 0.0917 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:51:39,744] [INFO] [logging.py:107:log_dist] [Rank 0] step=2244, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2244 loss: 0.1048 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:51:50,411] [INFO] [logging.py:107:log_dist] [Rank 0] step=2245, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2245 loss: 0.0900 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:52:01,084] [INFO] [logging.py:107:log_dist] [Rank 0] step=2246, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2246 loss: 0.0914 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:52:11,764] [INFO] [logging.py:107:log_dist] [Rank 0] step=2247, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2247 loss: 0.0535 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:52:22,433] [INFO] [logging.py:107:log_dist] [Rank 0] step=2248, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2248 loss: 0.0878 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:52:33,306] [INFO] [logging.py:107:log_dist] [Rank 0] step=2249, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2249 loss: 0.1008 iter time (s): 10.842 samples/sec: 0.092 +[2025-05-06 01:52:43,977] [INFO] [logging.py:107:log_dist] [Rank 0] step=2250, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2250 loss: 0.2551 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:52:54,647] [INFO] [logging.py:107:log_dist] [Rank 0] step=2251, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2251 loss: 0.0648 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:53:05,317] [INFO] [logging.py:107:log_dist] [Rank 0] step=2252, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2252 loss: 0.0350 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:53:15,983] [INFO] [logging.py:107:log_dist] [Rank 0] step=2253, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2253 loss: 0.0505 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:53:26,651] [INFO] [logging.py:107:log_dist] [Rank 0] step=2254, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2254 loss: 0.0294 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:53:37,319] [INFO] [logging.py:107:log_dist] [Rank 0] step=2255, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2255 loss: 0.0716 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 56 +[2025-05-06 01:53:48,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=2256, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2256 loss: 0.0403 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:53:58,996] [INFO] [logging.py:107:log_dist] [Rank 0] step=2257, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2257 loss: 0.0433 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:54:09,862] [INFO] [logging.py:107:log_dist] [Rank 0] step=2258, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2258 loss: 0.0754 iter time (s): 10.835 samples/sec: 0.092 +[2025-05-06 01:54:20,532] [INFO] [logging.py:107:log_dist] [Rank 0] step=2259, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2259 loss: 0.0905 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:54:31,207] [INFO] [logging.py:107:log_dist] [Rank 0] step=2260, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2260 loss: 0.0426 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:54:41,878] [INFO] [logging.py:107:log_dist] [Rank 0] step=2261, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2261 loss: 0.1248 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:54:52,546] [INFO] [logging.py:107:log_dist] [Rank 0] step=2262, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2262 loss: 0.0674 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:55:03,215] [INFO] [logging.py:107:log_dist] [Rank 0] step=2263, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2263 loss: 0.1544 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:55:13,889] [INFO] [logging.py:107:log_dist] [Rank 0] step=2264, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2264 loss: 0.0445 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 01:55:24,560] [INFO] [logging.py:107:log_dist] [Rank 0] step=2265, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2265 loss: 0.2119 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:55:35,403] [INFO] [logging.py:107:log_dist] [Rank 0] step=2266, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2266 loss: 0.1124 iter time (s): 10.813 samples/sec: 0.092 +[2025-05-06 01:55:46,077] [INFO] [logging.py:107:log_dist] [Rank 0] step=2267, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2267 loss: 0.0273 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:55:56,751] [INFO] [logging.py:107:log_dist] [Rank 0] step=2268, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2268 loss: 0.0589 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 01:56:07,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=2269, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2269 loss: 0.0927 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-06 01:56:18,106] [INFO] [logging.py:107:log_dist] [Rank 0] step=2270, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2270 loss: 0.0295 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:56:28,775] [INFO] [logging.py:107:log_dist] [Rank 0] step=2271, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2271 loss: 0.0391 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:56:39,450] [INFO] [logging.py:107:log_dist] [Rank 0] step=2272, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2272 loss: 0.0333 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:56:50,123] [INFO] [logging.py:107:log_dist] [Rank 0] step=2273, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2273 loss: 0.1151 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:57:00,790] [INFO] [logging.py:107:log_dist] [Rank 0] step=2274, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2274 loss: 0.0812 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:57:11,632] [INFO] [logging.py:107:log_dist] [Rank 0] step=2275, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2275 loss: 0.0346 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 01:57:22,319] [INFO] [logging.py:107:log_dist] [Rank 0] step=2276, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2276 loss: 0.1242 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 01:57:33,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=2277, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2277 loss: 0.0563 iter time (s): 10.657 samples/sec: 0.094 +[2025-05-06 01:57:43,684] [INFO] [logging.py:107:log_dist] [Rank 0] step=2278, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2278 loss: 0.1309 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:57:54,355] [INFO] [logging.py:107:log_dist] [Rank 0] step=2279, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2279 loss: 0.2279 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:58:05,023] [INFO] [logging.py:107:log_dist] [Rank 0] step=2280, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2280 loss: 0.0509 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 01:58:15,700] [INFO] [logging.py:107:log_dist] [Rank 0] step=2281, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2281 loss: 0.0482 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 01:58:26,371] [INFO] [logging.py:107:log_dist] [Rank 0] step=2282, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2282 loss: 0.0304 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 01:58:37,043] [INFO] [logging.py:107:log_dist] [Rank 0] step=2283, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2283 loss: 0.0272 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:58:47,874] [INFO] [logging.py:107:log_dist] [Rank 0] step=2284, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2284 loss: 0.0283 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 01:58:58,543] [INFO] [logging.py:107:log_dist] [Rank 0] step=2285, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2285 loss: 0.0316 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 01:59:09,214] [INFO] [logging.py:107:log_dist] [Rank 0] step=2286, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2286 loss: 0.0389 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 01:59:19,890] [INFO] [logging.py:107:log_dist] [Rank 0] step=2287, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2287 loss: 0.0372 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 01:59:30,560] [INFO] [logging.py:107:log_dist] [Rank 0] step=2288, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2288 loss: 0.3275 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:59:41,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=2289, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2289 loss: 0.0427 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 01:59:51,902] [INFO] [logging.py:107:log_dist] [Rank 0] step=2290, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2290 loss: 0.2778 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:00:02,586] [INFO] [logging.py:107:log_dist] [Rank 0] step=2291, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2291 loss: 0.0290 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-06 02:00:13,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=2292, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2292 loss: 0.0529 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-06 02:00:24,116] [INFO] [logging.py:107:log_dist] [Rank 0] step=2293, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2293 loss: 0.1740 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:00:34,785] [INFO] [logging.py:107:log_dist] [Rank 0] step=2294, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2294 loss: 0.0477 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:00:45,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=2295, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2295 loss: 0.0965 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:00:56,126] [INFO] [logging.py:107:log_dist] [Rank 0] step=2296, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2296 loss: 0.0558 iter time (s): 10.643 samples/sec: 0.094 +Started new epoch: 57 +[2025-05-06 02:01:07,141] [INFO] [logging.py:107:log_dist] [Rank 0] step=2297, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2297 loss: 0.1241 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:01:17,814] [INFO] [logging.py:107:log_dist] [Rank 0] step=2298, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2298 loss: 0.0323 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:01:28,494] [INFO] [logging.py:107:log_dist] [Rank 0] step=2299, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2299 loss: 0.1138 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 02:01:39,163] [INFO] [logging.py:107:log_dist] [Rank 0] step=2300, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2300 loss: 0.2435 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:01:49,992] [INFO] [logging.py:107:log_dist] [Rank 0] step=2301, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2301 loss: 0.2360 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 02:02:00,665] [INFO] [logging.py:107:log_dist] [Rank 0] step=2302, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2302 loss: 0.0897 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:02:11,339] [INFO] [logging.py:107:log_dist] [Rank 0] step=2303, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2303 loss: 0.0436 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:02:22,011] [INFO] [logging.py:107:log_dist] [Rank 0] step=2304, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2304 loss: 0.0478 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:02:32,689] [INFO] [logging.py:107:log_dist] [Rank 0] step=2305, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2305 loss: 0.1056 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:02:43,358] [INFO] [logging.py:107:log_dist] [Rank 0] step=2306, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2306 loss: 0.0346 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:02:54,029] [INFO] [logging.py:107:log_dist] [Rank 0] step=2307, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2307 loss: 0.0395 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:03:04,705] [INFO] [logging.py:107:log_dist] [Rank 0] step=2308, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2308 loss: 0.0639 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:03:15,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=2309, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2309 loss: 0.0552 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:03:26,235] [INFO] [logging.py:107:log_dist] [Rank 0] step=2310, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2310 loss: 0.2642 iter time (s): 10.823 samples/sec: 0.092 +[2025-05-06 02:03:36,908] [INFO] [logging.py:107:log_dist] [Rank 0] step=2311, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2311 loss: 0.0581 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:03:47,581] [INFO] [logging.py:107:log_dist] [Rank 0] step=2312, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2312 loss: 0.0654 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:03:58,249] [INFO] [logging.py:107:log_dist] [Rank 0] step=2313, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2313 loss: 0.1036 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:04:08,925] [INFO] [logging.py:107:log_dist] [Rank 0] step=2314, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2314 loss: 0.2946 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:04:19,594] [INFO] [logging.py:107:log_dist] [Rank 0] step=2315, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2315 loss: 0.1609 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:04:30,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=2316, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2316 loss: 0.0614 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:04:40,936] [INFO] [logging.py:107:log_dist] [Rank 0] step=2317, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2317 loss: 0.1729 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:04:51,773] [INFO] [logging.py:107:log_dist] [Rank 0] step=2318, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2318 loss: 0.0727 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 02:05:02,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=2319, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2319 loss: 0.0744 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 02:05:13,136] [INFO] [logging.py:107:log_dist] [Rank 0] step=2320, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2320 loss: 0.0505 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:05:23,806] [INFO] [logging.py:107:log_dist] [Rank 0] step=2321, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2321 loss: 0.1060 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:05:34,477] [INFO] [logging.py:107:log_dist] [Rank 0] step=2322, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2322 loss: 0.0355 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:05:45,147] [INFO] [logging.py:107:log_dist] [Rank 0] step=2323, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2323 loss: 0.0506 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:05:55,817] [INFO] [logging.py:107:log_dist] [Rank 0] step=2324, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2324 loss: 0.0328 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:06:06,493] [INFO] [logging.py:107:log_dist] [Rank 0] step=2325, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2325 loss: 0.0307 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:06:17,169] [INFO] [logging.py:107:log_dist] [Rank 0] step=2326, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2326 loss: 0.1054 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:06:28,004] [INFO] [logging.py:107:log_dist] [Rank 0] step=2327, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2327 loss: 0.2652 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 02:06:38,690] [INFO] [logging.py:107:log_dist] [Rank 0] step=2328, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2328 loss: 0.0937 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 02:06:49,361] [INFO] [logging.py:107:log_dist] [Rank 0] step=2329, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2329 loss: 0.4921 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:07:00,029] [INFO] [logging.py:107:log_dist] [Rank 0] step=2330, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2330 loss: 0.0304 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:07:10,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=2331, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2331 loss: 0.0694 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:07:21,375] [INFO] [logging.py:107:log_dist] [Rank 0] step=2332, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2332 loss: 0.0422 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:07:32,043] [INFO] [logging.py:107:log_dist] [Rank 0] step=2333, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2333 loss: 0.0753 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:07:42,714] [INFO] [logging.py:107:log_dist] [Rank 0] step=2334, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2334 loss: 0.0307 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:07:53,598] [INFO] [logging.py:107:log_dist] [Rank 0] step=2335, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2335 loss: 0.1554 iter time (s): 10.853 samples/sec: 0.092 +[2025-05-06 02:08:04,265] [INFO] [logging.py:107:log_dist] [Rank 0] step=2336, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2336 loss: 0.0317 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:08:14,934] [INFO] [logging.py:107:log_dist] [Rank 0] step=2337, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2337 loss: 0.1206 iter time (s): 10.643 samples/sec: 0.094 +Started new epoch: 58 +[2025-05-06 02:08:25,953] [INFO] [logging.py:107:log_dist] [Rank 0] step=2338, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2338 loss: 0.0329 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:08:36,624] [INFO] [logging.py:107:log_dist] [Rank 0] step=2339, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2339 loss: 0.0665 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:08:47,295] [INFO] [logging.py:107:log_dist] [Rank 0] step=2340, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2340 loss: 0.0551 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:08:57,970] [INFO] [logging.py:107:log_dist] [Rank 0] step=2341, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2341 loss: 0.0351 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:09:08,647] [INFO] [logging.py:107:log_dist] [Rank 0] step=2342, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2342 loss: 0.1693 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 02:09:19,321] [INFO] [logging.py:107:log_dist] [Rank 0] step=2343, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2343 loss: 0.0309 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:09:30,151] [INFO] [logging.py:107:log_dist] [Rank 0] step=2344, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2344 loss: 0.0666 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 02:09:40,822] [INFO] [logging.py:107:log_dist] [Rank 0] step=2345, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2345 loss: 0.1683 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:09:51,492] [INFO] [logging.py:107:log_dist] [Rank 0] step=2346, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2346 loss: 0.5451 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:10:02,164] [INFO] [logging.py:107:log_dist] [Rank 0] step=2347, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2347 loss: 0.0351 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:10:12,838] [INFO] [logging.py:107:log_dist] [Rank 0] step=2348, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2348 loss: 0.0505 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:10:23,510] [INFO] [logging.py:107:log_dist] [Rank 0] step=2349, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2349 loss: 0.0705 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:10:34,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=2350, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2350 loss: 0.0465 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:10:44,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=2351, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2351 loss: 0.1046 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:10:55,526] [INFO] [logging.py:107:log_dist] [Rank 0] step=2352, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2352 loss: 0.0967 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:11:06,361] [INFO] [logging.py:107:log_dist] [Rank 0] step=2353, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2353 loss: 0.1468 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 02:11:17,030] [INFO] [logging.py:107:log_dist] [Rank 0] step=2354, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2354 loss: 0.1194 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:11:27,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=2355, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2355 loss: 0.0383 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:11:38,374] [INFO] [logging.py:107:log_dist] [Rank 0] step=2356, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2356 loss: 0.0541 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:11:49,041] [INFO] [logging.py:107:log_dist] [Rank 0] step=2357, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2357 loss: 0.0639 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:11:59,713] [INFO] [logging.py:107:log_dist] [Rank 0] step=2358, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2358 loss: 0.0563 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:12:10,386] [INFO] [logging.py:107:log_dist] [Rank 0] step=2359, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2359 loss: 0.0766 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:12:21,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=2360, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2360 loss: 0.0367 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:12:31,734] [INFO] [logging.py:107:log_dist] [Rank 0] step=2361, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2361 loss: 0.1821 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:12:42,565] [INFO] [logging.py:107:log_dist] [Rank 0] step=2362, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2362 loss: 0.0450 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 02:12:53,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=2363, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2363 loss: 0.0291 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:13:03,908] [INFO] [logging.py:107:log_dist] [Rank 0] step=2364, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2364 loss: 0.1956 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:13:14,577] [INFO] [logging.py:107:log_dist] [Rank 0] step=2365, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2365 loss: 0.0447 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:13:25,250] [INFO] [logging.py:107:log_dist] [Rank 0] step=2366, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2366 loss: 0.0645 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:13:35,923] [INFO] [logging.py:107:log_dist] [Rank 0] step=2367, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2367 loss: 0.1026 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:13:46,591] [INFO] [logging.py:107:log_dist] [Rank 0] step=2368, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2368 loss: 0.3403 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:13:57,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=2369, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2369 loss: 0.0968 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:14:08,128] [INFO] [logging.py:107:log_dist] [Rank 0] step=2370, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2370 loss: 0.0499 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-06 02:14:18,798] [INFO] [logging.py:107:log_dist] [Rank 0] step=2371, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2371 loss: 0.0370 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:14:29,466] [INFO] [logging.py:107:log_dist] [Rank 0] step=2372, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2372 loss: 0.1267 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:14:40,143] [INFO] [logging.py:107:log_dist] [Rank 0] step=2373, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2373 loss: 0.1116 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:14:50,815] [INFO] [logging.py:107:log_dist] [Rank 0] step=2374, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2374 loss: 0.2115 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:15:01,494] [INFO] [logging.py:107:log_dist] [Rank 0] step=2375, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2375 loss: 0.0436 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 02:15:12,167] [INFO] [logging.py:107:log_dist] [Rank 0] step=2376, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2376 loss: 0.5933 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:15:22,834] [INFO] [logging.py:107:log_dist] [Rank 0] step=2377, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2377 loss: 0.0922 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:15:33,506] [INFO] [logging.py:107:log_dist] [Rank 0] step=2378, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2378 loss: 0.0625 iter time (s): 10.645 samples/sec: 0.094 +Started new epoch: 59 +[2025-05-06 02:15:44,733] [INFO] [logging.py:107:log_dist] [Rank 0] step=2379, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2379 loss: 0.0341 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 02:15:55,411] [INFO] [logging.py:107:log_dist] [Rank 0] step=2380, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2380 loss: 0.1291 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:16:06,083] [INFO] [logging.py:107:log_dist] [Rank 0] step=2381, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2381 loss: 0.0323 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:16:16,752] [INFO] [logging.py:107:log_dist] [Rank 0] step=2382, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2382 loss: 0.0401 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:16:27,422] [INFO] [logging.py:107:log_dist] [Rank 0] step=2383, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2383 loss: 0.1327 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:16:38,095] [INFO] [logging.py:107:log_dist] [Rank 0] step=2384, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2384 loss: 0.1825 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:16:48,766] [INFO] [logging.py:107:log_dist] [Rank 0] step=2385, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2385 loss: 0.1116 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:16:59,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=2386, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2386 loss: 0.1030 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:17:10,111] [INFO] [logging.py:107:log_dist] [Rank 0] step=2387, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2387 loss: 0.0477 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:17:20,944] [INFO] [logging.py:107:log_dist] [Rank 0] step=2388, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2388 loss: 0.1701 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 02:17:31,613] [INFO] [logging.py:107:log_dist] [Rank 0] step=2389, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2389 loss: 0.1348 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:17:42,285] [INFO] [logging.py:107:log_dist] [Rank 0] step=2390, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2390 loss: 0.0749 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:17:52,959] [INFO] [logging.py:107:log_dist] [Rank 0] step=2391, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2391 loss: 0.0826 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:18:03,629] [INFO] [logging.py:107:log_dist] [Rank 0] step=2392, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2392 loss: 0.0369 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:18:14,302] [INFO] [logging.py:107:log_dist] [Rank 0] step=2393, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2393 loss: 0.0384 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:18:24,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=2394, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2394 loss: 0.0999 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:18:35,641] [INFO] [logging.py:107:log_dist] [Rank 0] step=2395, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2395 loss: 0.0739 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:18:46,502] [INFO] [logging.py:107:log_dist] [Rank 0] step=2396, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2396 loss: 0.1717 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 02:18:57,174] [INFO] [logging.py:107:log_dist] [Rank 0] step=2397, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2397 loss: 0.0393 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:19:07,844] [INFO] [logging.py:107:log_dist] [Rank 0] step=2398, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2398 loss: 0.0491 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:19:18,516] [INFO] [logging.py:107:log_dist] [Rank 0] step=2399, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2399 loss: 0.0513 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:19:29,189] [INFO] [logging.py:107:log_dist] [Rank 0] step=2400, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2400 loss: 0.0584 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:19:39,862] [INFO] [logging.py:107:log_dist] [Rank 0] step=2401, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2401 loss: 0.0702 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:19:50,534] [INFO] [logging.py:107:log_dist] [Rank 0] step=2402, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2402 loss: 0.0906 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:20:01,207] [INFO] [logging.py:107:log_dist] [Rank 0] step=2403, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2403 loss: 0.1393 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:20:11,878] [INFO] [logging.py:107:log_dist] [Rank 0] step=2404, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2404 loss: 0.0269 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:20:22,736] [INFO] [logging.py:107:log_dist] [Rank 0] step=2405, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2405 loss: 0.0835 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 02:20:33,410] [INFO] [logging.py:107:log_dist] [Rank 0] step=2406, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2406 loss: 0.0577 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:20:44,078] [INFO] [logging.py:107:log_dist] [Rank 0] step=2407, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2407 loss: 0.1061 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:20:54,750] [INFO] [logging.py:107:log_dist] [Rank 0] step=2408, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2408 loss: 0.0738 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:21:05,421] [INFO] [logging.py:107:log_dist] [Rank 0] step=2409, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2409 loss: 0.0290 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:21:16,091] [INFO] [logging.py:107:log_dist] [Rank 0] step=2410, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2410 loss: 0.2608 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:21:26,764] [INFO] [logging.py:107:log_dist] [Rank 0] step=2411, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2411 loss: 0.0507 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:21:37,434] [INFO] [logging.py:107:log_dist] [Rank 0] step=2412, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2412 loss: 0.0457 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:21:48,112] [INFO] [logging.py:107:log_dist] [Rank 0] step=2413, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2413 loss: 0.0431 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:21:58,942] [INFO] [logging.py:107:log_dist] [Rank 0] step=2414, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2414 loss: 0.0435 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 02:22:09,612] [INFO] [logging.py:107:log_dist] [Rank 0] step=2415, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2415 loss: 0.1431 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:22:20,280] [INFO] [logging.py:107:log_dist] [Rank 0] step=2416, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2416 loss: 0.1125 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:22:30,952] [INFO] [logging.py:107:log_dist] [Rank 0] step=2417, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2417 loss: 0.1368 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:22:41,620] [INFO] [logging.py:107:log_dist] [Rank 0] step=2418, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2418 loss: 0.1684 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:22:52,283] [INFO] [logging.py:107:log_dist] [Rank 0] step=2419, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2419 loss: 0.0452 iter time (s): 10.636 samples/sec: 0.094 +Started new epoch: 60 +[2025-05-06 02:23:03,304] [INFO] [logging.py:107:log_dist] [Rank 0] step=2420, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2420 loss: 0.0645 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:23:13,979] [INFO] [logging.py:107:log_dist] [Rank 0] step=2421, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2421 loss: 0.1649 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:23:24,822] [INFO] [logging.py:107:log_dist] [Rank 0] step=2422, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2422 loss: 0.1428 iter time (s): 10.813 samples/sec: 0.092 +[2025-05-06 02:23:35,501] [INFO] [logging.py:107:log_dist] [Rank 0] step=2423, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2423 loss: 0.0777 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 02:23:46,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=2424, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2424 loss: 0.0911 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:23:56,848] [INFO] [logging.py:107:log_dist] [Rank 0] step=2425, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2425 loss: 0.1399 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:24:07,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=2426, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2426 loss: 0.2346 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:24:18,187] [INFO] [logging.py:107:log_dist] [Rank 0] step=2427, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2427 loss: 0.0502 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:24:28,855] [INFO] [logging.py:107:log_dist] [Rank 0] step=2428, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2428 loss: 0.0542 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:24:39,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=2429, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2429 loss: 0.0329 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:24:50,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=2430, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2430 loss: 0.1882 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:25:01,029] [INFO] [logging.py:107:log_dist] [Rank 0] step=2431, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2431 loss: 0.0540 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 02:25:11,702] [INFO] [logging.py:107:log_dist] [Rank 0] step=2432, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2432 loss: 0.0309 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:25:22,370] [INFO] [logging.py:107:log_dist] [Rank 0] step=2433, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2433 loss: 0.0286 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:25:33,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=2434, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2434 loss: 0.0878 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 02:25:43,720] [INFO] [logging.py:107:log_dist] [Rank 0] step=2435, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2435 loss: 0.1015 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:25:54,388] [INFO] [logging.py:107:log_dist] [Rank 0] step=2436, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2436 loss: 0.0336 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:26:05,063] [INFO] [logging.py:107:log_dist] [Rank 0] step=2437, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2437 loss: 0.0578 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:26:15,741] [INFO] [logging.py:107:log_dist] [Rank 0] step=2438, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2438 loss: 0.0933 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 02:26:26,602] [INFO] [logging.py:107:log_dist] [Rank 0] step=2439, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2439 loss: 0.1113 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-06 02:26:37,279] [INFO] [logging.py:107:log_dist] [Rank 0] step=2440, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2440 loss: 0.0308 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 02:26:47,952] [INFO] [logging.py:107:log_dist] [Rank 0] step=2441, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2441 loss: 0.0512 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:26:58,618] [INFO] [logging.py:107:log_dist] [Rank 0] step=2442, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2442 loss: 0.0459 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:27:09,294] [INFO] [logging.py:107:log_dist] [Rank 0] step=2443, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2443 loss: 0.0529 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:27:19,964] [INFO] [logging.py:107:log_dist] [Rank 0] step=2444, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2444 loss: 0.0318 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:27:30,638] [INFO] [logging.py:107:log_dist] [Rank 0] step=2445, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2445 loss: 0.0260 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:27:41,310] [INFO] [logging.py:107:log_dist] [Rank 0] step=2446, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2446 loss: 0.0639 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:27:51,979] [INFO] [logging.py:107:log_dist] [Rank 0] step=2447, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2447 loss: 0.1018 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:28:02,836] [INFO] [logging.py:107:log_dist] [Rank 0] step=2448, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2448 loss: 0.0328 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 02:28:13,508] [INFO] [logging.py:107:log_dist] [Rank 0] step=2449, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2449 loss: 0.1373 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:28:24,176] [INFO] [logging.py:107:log_dist] [Rank 0] step=2450, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2450 loss: 0.0657 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:28:34,845] [INFO] [logging.py:107:log_dist] [Rank 0] step=2451, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2451 loss: 0.0370 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:28:45,515] [INFO] [logging.py:107:log_dist] [Rank 0] step=2452, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2452 loss: 0.0366 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:28:56,188] [INFO] [logging.py:107:log_dist] [Rank 0] step=2453, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2453 loss: 0.0393 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:29:06,857] [INFO] [logging.py:107:log_dist] [Rank 0] step=2454, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2454 loss: 0.0703 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:29:17,534] [INFO] [logging.py:107:log_dist] [Rank 0] step=2455, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2455 loss: 0.0464 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:29:28,200] [INFO] [logging.py:107:log_dist] [Rank 0] step=2456, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2456 loss: 0.0338 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:29:39,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=2457, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2457 loss: 0.0294 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 02:29:49,697] [INFO] [logging.py:107:log_dist] [Rank 0] step=2458, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2458 loss: 0.2000 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:30:00,364] [INFO] [logging.py:107:log_dist] [Rank 0] step=2459, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2459 loss: 0.0659 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:30:11,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=2460, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2460 loss: 0.0712 iter time (s): 10.660 samples/sec: 0.094 +Saving model to directory epoch60 +Started new epoch: 61 +[2025-05-06 02:30:23,615] [INFO] [logging.py:107:log_dist] [Rank 0] step=2461, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2461 loss: 0.1410 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:30:34,284] [INFO] [logging.py:107:log_dist] [Rank 0] step=2462, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2462 loss: 0.1350 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:30:44,954] [INFO] [logging.py:107:log_dist] [Rank 0] step=2463, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2463 loss: 0.0928 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:30:55,627] [INFO] [logging.py:107:log_dist] [Rank 0] step=2464, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2464 loss: 0.0949 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:31:06,456] [INFO] [logging.py:107:log_dist] [Rank 0] step=2465, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2465 loss: 0.0413 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 02:31:17,128] [INFO] [logging.py:107:log_dist] [Rank 0] step=2466, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2466 loss: 0.0580 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:31:27,803] [INFO] [logging.py:107:log_dist] [Rank 0] step=2467, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2467 loss: 0.0582 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:31:38,476] [INFO] [logging.py:107:log_dist] [Rank 0] step=2468, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2468 loss: 0.0288 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:31:49,148] [INFO] [logging.py:107:log_dist] [Rank 0] step=2469, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2469 loss: 0.3200 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:31:59,822] [INFO] [logging.py:107:log_dist] [Rank 0] step=2470, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2470 loss: 0.1467 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:32:10,497] [INFO] [logging.py:107:log_dist] [Rank 0] step=2471, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2471 loss: 0.2161 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:32:21,166] [INFO] [logging.py:107:log_dist] [Rank 0] step=2472, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2472 loss: 0.0458 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:32:32,034] [INFO] [logging.py:107:log_dist] [Rank 0] step=2473, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2473 loss: 0.0484 iter time (s): 10.836 samples/sec: 0.092 +[2025-05-06 02:32:42,708] [INFO] [logging.py:107:log_dist] [Rank 0] step=2474, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2474 loss: 0.0381 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:32:53,379] [INFO] [logging.py:107:log_dist] [Rank 0] step=2475, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2475 loss: 0.0546 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:33:04,053] [INFO] [logging.py:107:log_dist] [Rank 0] step=2476, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2476 loss: 0.0308 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:33:14,723] [INFO] [logging.py:107:log_dist] [Rank 0] step=2477, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2477 loss: 0.0274 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:33:25,400] [INFO] [logging.py:107:log_dist] [Rank 0] step=2478, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2478 loss: 0.0644 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 02:33:36,079] [INFO] [logging.py:107:log_dist] [Rank 0] step=2479, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2479 loss: 0.0956 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 02:33:46,753] [INFO] [logging.py:107:log_dist] [Rank 0] step=2480, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2480 loss: 0.0763 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:33:57,436] [INFO] [logging.py:107:log_dist] [Rank 0] step=2481, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2481 loss: 0.0412 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 02:34:08,268] [INFO] [logging.py:107:log_dist] [Rank 0] step=2482, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2482 loss: 0.0499 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 02:34:18,941] [INFO] [logging.py:107:log_dist] [Rank 0] step=2483, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2483 loss: 0.0361 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:34:29,612] [INFO] [logging.py:107:log_dist] [Rank 0] step=2484, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2484 loss: 0.0562 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:34:40,287] [INFO] [logging.py:107:log_dist] [Rank 0] step=2485, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2485 loss: 0.0410 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:34:50,957] [INFO] [logging.py:107:log_dist] [Rank 0] step=2486, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2486 loss: 0.0372 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:35:01,639] [INFO] [logging.py:107:log_dist] [Rank 0] step=2487, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2487 loss: 0.0285 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 02:35:12,315] [INFO] [logging.py:107:log_dist] [Rank 0] step=2488, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2488 loss: 0.0273 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:35:22,987] [INFO] [logging.py:107:log_dist] [Rank 0] step=2489, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2489 loss: 0.0706 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:35:33,658] [INFO] [logging.py:107:log_dist] [Rank 0] step=2490, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2490 loss: 0.1012 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:35:44,518] [INFO] [logging.py:107:log_dist] [Rank 0] step=2491, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2491 loss: 0.1929 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 02:35:55,185] [INFO] [logging.py:107:log_dist] [Rank 0] step=2492, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2492 loss: 0.0426 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 02:36:05,859] [INFO] [logging.py:107:log_dist] [Rank 0] step=2493, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2493 loss: 0.0363 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:36:16,529] [INFO] [logging.py:107:log_dist] [Rank 0] step=2494, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2494 loss: 0.0566 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:36:27,201] [INFO] [logging.py:107:log_dist] [Rank 0] step=2495, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2495 loss: 0.2683 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:36:37,875] [INFO] [logging.py:107:log_dist] [Rank 0] step=2496, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2496 loss: 0.0424 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:36:48,547] [INFO] [logging.py:107:log_dist] [Rank 0] step=2497, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2497 loss: 0.0830 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:36:59,216] [INFO] [logging.py:107:log_dist] [Rank 0] step=2498, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2498 loss: 0.0488 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:37:10,050] [INFO] [logging.py:107:log_dist] [Rank 0] step=2499, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2499 loss: 0.0732 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 02:37:20,717] [INFO] [logging.py:107:log_dist] [Rank 0] step=2500, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2500 loss: 0.0766 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:37:31,388] [INFO] [logging.py:107:log_dist] [Rank 0] step=2501, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2501 loss: 0.0513 iter time (s): 10.644 samples/sec: 0.094 +Started new epoch: 62 +[2025-05-06 02:37:42,395] [INFO] [logging.py:107:log_dist] [Rank 0] step=2502, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2502 loss: 0.2173 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:37:53,067] [INFO] [logging.py:107:log_dist] [Rank 0] step=2503, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2503 loss: 0.0623 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:38:03,740] [INFO] [logging.py:107:log_dist] [Rank 0] step=2504, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2504 loss: 0.0881 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:38:14,412] [INFO] [logging.py:107:log_dist] [Rank 0] step=2505, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2505 loss: 0.0329 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:38:25,083] [INFO] [logging.py:107:log_dist] [Rank 0] step=2506, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2506 loss: 0.0938 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:38:35,752] [INFO] [logging.py:107:log_dist] [Rank 0] step=2507, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2507 loss: 0.0423 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:38:46,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=2508, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2508 loss: 0.1013 iter time (s): 10.814 samples/sec: 0.092 +[2025-05-06 02:38:57,283] [INFO] [logging.py:107:log_dist] [Rank 0] step=2509, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2509 loss: 0.0496 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:39:07,956] [INFO] [logging.py:107:log_dist] [Rank 0] step=2510, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2510 loss: 0.1185 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:39:18,631] [INFO] [logging.py:107:log_dist] [Rank 0] step=2511, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2511 loss: 0.0740 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:39:29,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=2512, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2512 loss: 0.0559 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:39:39,970] [INFO] [logging.py:107:log_dist] [Rank 0] step=2513, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2513 loss: 0.0324 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:39:50,641] [INFO] [logging.py:107:log_dist] [Rank 0] step=2514, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2514 loss: 0.1049 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:40:01,311] [INFO] [logging.py:107:log_dist] [Rank 0] step=2515, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2515 loss: 0.0278 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:40:12,193] [INFO] [logging.py:107:log_dist] [Rank 0] step=2516, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2516 loss: 0.0268 iter time (s): 10.852 samples/sec: 0.092 +[2025-05-06 02:40:22,867] [INFO] [logging.py:107:log_dist] [Rank 0] step=2517, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2517 loss: 0.0287 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:40:33,539] [INFO] [logging.py:107:log_dist] [Rank 0] step=2518, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2518 loss: 0.0656 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:40:44,218] [INFO] [logging.py:107:log_dist] [Rank 0] step=2519, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2519 loss: 0.0970 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:40:54,891] [INFO] [logging.py:107:log_dist] [Rank 0] step=2520, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2520 loss: 0.0273 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:41:05,564] [INFO] [logging.py:107:log_dist] [Rank 0] step=2521, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2521 loss: 0.1908 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:41:16,238] [INFO] [logging.py:107:log_dist] [Rank 0] step=2522, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2522 loss: 0.1506 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:41:26,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=2523, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2523 loss: 0.0523 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:41:37,583] [INFO] [logging.py:107:log_dist] [Rank 0] step=2524, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2524 loss: 0.0605 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:41:48,414] [INFO] [logging.py:107:log_dist] [Rank 0] step=2525, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2525 loss: 0.0477 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 02:41:59,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=2526, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2526 loss: 0.0637 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:42:09,761] [INFO] [logging.py:107:log_dist] [Rank 0] step=2527, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2527 loss: 0.0575 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:42:20,432] [INFO] [logging.py:107:log_dist] [Rank 0] step=2528, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2528 loss: 0.0257 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:42:31,104] [INFO] [logging.py:107:log_dist] [Rank 0] step=2529, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2529 loss: 0.1061 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:42:41,773] [INFO] [logging.py:107:log_dist] [Rank 0] step=2530, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2530 loss: 0.0272 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:42:52,444] [INFO] [logging.py:107:log_dist] [Rank 0] step=2531, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2531 loss: 0.1830 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:43:03,124] [INFO] [logging.py:107:log_dist] [Rank 0] step=2532, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2532 loss: 0.8287 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 02:43:13,799] [INFO] [logging.py:107:log_dist] [Rank 0] step=2533, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2533 loss: 0.0741 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:43:24,630] [INFO] [logging.py:107:log_dist] [Rank 0] step=2534, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2534 loss: 0.0484 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 02:43:35,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=2535, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2535 loss: 0.0679 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:43:45,979] [INFO] [logging.py:107:log_dist] [Rank 0] step=2536, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2536 loss: 0.0289 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:43:56,660] [INFO] [logging.py:107:log_dist] [Rank 0] step=2537, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2537 loss: 0.2675 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 02:44:07,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=2538, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2538 loss: 0.0542 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:44:18,005] [INFO] [logging.py:107:log_dist] [Rank 0] step=2539, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2539 loss: 0.0447 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:44:28,680] [INFO] [logging.py:107:log_dist] [Rank 0] step=2540, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2540 loss: 0.0949 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:44:39,350] [INFO] [logging.py:107:log_dist] [Rank 0] step=2541, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2541 loss: 0.0406 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:44:50,021] [INFO] [logging.py:107:log_dist] [Rank 0] step=2542, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2542 loss: 0.0974 iter time (s): 10.644 samples/sec: 0.094 +Started new epoch: 63 +[2025-05-06 02:45:01,190] [INFO] [logging.py:107:log_dist] [Rank 0] step=2543, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2543 loss: 0.0536 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 02:45:11,861] [INFO] [logging.py:107:log_dist] [Rank 0] step=2544, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2544 loss: 0.0376 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:45:22,533] [INFO] [logging.py:107:log_dist] [Rank 0] step=2545, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2545 loss: 0.1302 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:45:33,208] [INFO] [logging.py:107:log_dist] [Rank 0] step=2546, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2546 loss: 0.0769 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:45:43,879] [INFO] [logging.py:107:log_dist] [Rank 0] step=2547, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2547 loss: 0.1859 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:45:54,554] [INFO] [logging.py:107:log_dist] [Rank 0] step=2548, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2548 loss: 0.1453 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:46:05,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=2549, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2549 loss: 0.0603 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:46:15,900] [INFO] [logging.py:107:log_dist] [Rank 0] step=2550, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2550 loss: 0.0376 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:46:26,794] [INFO] [logging.py:107:log_dist] [Rank 0] step=2551, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2551 loss: 0.0574 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 02:46:37,471] [INFO] [logging.py:107:log_dist] [Rank 0] step=2552, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2552 loss: 0.1058 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:46:48,145] [INFO] [logging.py:107:log_dist] [Rank 0] step=2553, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2553 loss: 0.0477 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:46:58,816] [INFO] [logging.py:107:log_dist] [Rank 0] step=2554, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2554 loss: 0.0278 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:47:09,507] [INFO] [logging.py:107:log_dist] [Rank 0] step=2555, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2555 loss: 0.0565 iter time (s): 10.660 samples/sec: 0.094 +[2025-05-06 02:47:20,183] [INFO] [logging.py:107:log_dist] [Rank 0] step=2556, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2556 loss: 0.0921 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:47:30,859] [INFO] [logging.py:107:log_dist] [Rank 0] step=2557, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2557 loss: 0.0522 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:47:41,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=2558, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2558 loss: 0.0288 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:47:52,203] [INFO] [logging.py:107:log_dist] [Rank 0] step=2559, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2559 loss: 0.1175 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:48:03,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=2560, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2560 loss: 0.0481 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-06 02:48:13,743] [INFO] [logging.py:107:log_dist] [Rank 0] step=2561, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2561 loss: 0.1187 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:48:24,418] [INFO] [logging.py:107:log_dist] [Rank 0] step=2562, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2562 loss: 0.0360 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:48:35,089] [INFO] [logging.py:107:log_dist] [Rank 0] step=2563, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2563 loss: 0.0501 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:48:45,767] [INFO] [logging.py:107:log_dist] [Rank 0] step=2564, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2564 loss: 0.0340 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:48:56,440] [INFO] [logging.py:107:log_dist] [Rank 0] step=2565, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2565 loss: 0.0478 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:49:07,132] [INFO] [logging.py:107:log_dist] [Rank 0] step=2566, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2566 loss: 0.0391 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-06 02:49:17,804] [INFO] [logging.py:107:log_dist] [Rank 0] step=2567, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2567 loss: 0.0309 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:49:28,483] [INFO] [logging.py:107:log_dist] [Rank 0] step=2568, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2568 loss: 0.0269 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:49:39,321] [INFO] [logging.py:107:log_dist] [Rank 0] step=2569, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2569 loss: 0.3367 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-06 02:49:49,993] [INFO] [logging.py:107:log_dist] [Rank 0] step=2570, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2570 loss: 0.0311 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:50:00,660] [INFO] [logging.py:107:log_dist] [Rank 0] step=2571, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2571 loss: 0.0269 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:50:11,335] [INFO] [logging.py:107:log_dist] [Rank 0] step=2572, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2572 loss: 0.0382 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:50:22,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=2573, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2573 loss: 0.0642 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:50:32,680] [INFO] [logging.py:107:log_dist] [Rank 0] step=2574, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2574 loss: 0.0848 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:50:43,355] [INFO] [logging.py:107:log_dist] [Rank 0] step=2575, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2575 loss: 0.0838 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:50:54,029] [INFO] [logging.py:107:log_dist] [Rank 0] step=2576, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2576 loss: 0.0383 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:51:04,887] [INFO] [logging.py:107:log_dist] [Rank 0] step=2577, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2577 loss: 0.0263 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 02:51:15,563] [INFO] [logging.py:107:log_dist] [Rank 0] step=2578, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2578 loss: 0.0485 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:51:26,243] [INFO] [logging.py:107:log_dist] [Rank 0] step=2579, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2579 loss: 0.1932 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 02:51:36,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=2580, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2580 loss: 0.0280 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:51:47,584] [INFO] [logging.py:107:log_dist] [Rank 0] step=2581, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2581 loss: 0.1274 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:51:58,271] [INFO] [logging.py:107:log_dist] [Rank 0] step=2582, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2582 loss: 0.1440 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 02:52:08,944] [INFO] [logging.py:107:log_dist] [Rank 0] step=2583, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2583 loss: 0.1010 iter time (s): 10.646 samples/sec: 0.094 +Started new epoch: 64 +[2025-05-06 02:52:19,957] [INFO] [logging.py:107:log_dist] [Rank 0] step=2584, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2584 loss: 0.0924 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:52:30,633] [INFO] [logging.py:107:log_dist] [Rank 0] step=2585, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2585 loss: 0.1187 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:52:41,493] [INFO] [logging.py:107:log_dist] [Rank 0] step=2586, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2586 loss: 0.0609 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 02:52:52,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=2587, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2587 loss: 0.0839 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:53:02,837] [INFO] [logging.py:107:log_dist] [Rank 0] step=2588, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2588 loss: 0.1357 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:53:13,507] [INFO] [logging.py:107:log_dist] [Rank 0] step=2589, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2589 loss: 0.0274 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:53:24,176] [INFO] [logging.py:107:log_dist] [Rank 0] step=2590, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2590 loss: 0.0388 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:53:34,848] [INFO] [logging.py:107:log_dist] [Rank 0] step=2591, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2591 loss: 0.0906 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:53:45,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=2592, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2592 loss: 0.1623 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:53:56,209] [INFO] [logging.py:107:log_dist] [Rank 0] step=2593, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2593 loss: 0.0489 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 02:54:06,880] [INFO] [logging.py:107:log_dist] [Rank 0] step=2594, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2594 loss: 0.0631 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:54:17,708] [INFO] [logging.py:107:log_dist] [Rank 0] step=2595, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2595 loss: 0.3414 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 02:54:28,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=2596, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2596 loss: 0.0767 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 02:54:39,047] [INFO] [logging.py:107:log_dist] [Rank 0] step=2597, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2597 loss: 0.0276 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 02:54:49,722] [INFO] [logging.py:107:log_dist] [Rank 0] step=2598, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2598 loss: 0.0588 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:55:00,399] [INFO] [logging.py:107:log_dist] [Rank 0] step=2599, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2599 loss: 0.0348 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:55:11,072] [INFO] [logging.py:107:log_dist] [Rank 0] step=2600, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2600 loss: 0.0462 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:55:21,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=2601, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2601 loss: 0.0810 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:55:32,424] [INFO] [logging.py:107:log_dist] [Rank 0] step=2602, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2602 loss: 0.1245 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 02:55:43,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=2603, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2603 loss: 0.0374 iter time (s): 10.819 samples/sec: 0.092 +[2025-05-06 02:55:53,950] [INFO] [logging.py:107:log_dist] [Rank 0] step=2604, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2604 loss: 0.0485 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:56:04,626] [INFO] [logging.py:107:log_dist] [Rank 0] step=2605, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2605 loss: 0.0405 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 02:56:15,298] [INFO] [logging.py:107:log_dist] [Rank 0] step=2606, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2606 loss: 0.0798 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:56:25,970] [INFO] [logging.py:107:log_dist] [Rank 0] step=2607, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2607 loss: 0.0328 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:56:36,642] [INFO] [logging.py:107:log_dist] [Rank 0] step=2608, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2608 loss: 0.0290 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:56:47,314] [INFO] [logging.py:107:log_dist] [Rank 0] step=2609, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2609 loss: 0.1958 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 02:56:57,984] [INFO] [logging.py:107:log_dist] [Rank 0] step=2610, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2610 loss: 0.0322 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:57:08,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=2611, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2611 loss: 0.0353 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:57:19,517] [INFO] [logging.py:107:log_dist] [Rank 0] step=2612, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2612 loss: 0.0643 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 02:57:30,187] [INFO] [logging.py:107:log_dist] [Rank 0] step=2613, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2613 loss: 0.2776 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:57:40,858] [INFO] [logging.py:107:log_dist] [Rank 0] step=2614, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2614 loss: 0.0678 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:57:51,532] [INFO] [logging.py:107:log_dist] [Rank 0] step=2615, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2615 loss: 0.0645 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:58:02,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=2616, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2616 loss: 0.1296 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 02:58:12,875] [INFO] [logging.py:107:log_dist] [Rank 0] step=2617, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2617 loss: 0.1110 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 02:58:23,545] [INFO] [logging.py:107:log_dist] [Rank 0] step=2618, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2618 loss: 0.0304 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:58:34,216] [INFO] [logging.py:107:log_dist] [Rank 0] step=2619, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2619 loss: 0.0660 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 02:58:45,076] [INFO] [logging.py:107:log_dist] [Rank 0] step=2620, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2620 loss: 0.0440 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 02:58:55,744] [INFO] [logging.py:107:log_dist] [Rank 0] step=2621, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2621 loss: 0.0726 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 02:59:06,418] [INFO] [logging.py:107:log_dist] [Rank 0] step=2622, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2622 loss: 0.0771 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 02:59:17,102] [INFO] [logging.py:107:log_dist] [Rank 0] step=2623, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2623 loss: 0.0398 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 02:59:27,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=2624, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2624 loss: 0.1772 iter time (s): 10.637 samples/sec: 0.094 +Started new epoch: 65 +[2025-05-06 02:59:38,784] [INFO] [logging.py:107:log_dist] [Rank 0] step=2625, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2625 loss: 0.0397 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 02:59:49,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=2626, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2626 loss: 0.1144 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 03:00:00,136] [INFO] [logging.py:107:log_dist] [Rank 0] step=2627, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2627 loss: 0.0435 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:00:10,814] [INFO] [logging.py:107:log_dist] [Rank 0] step=2628, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2628 loss: 0.0984 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 03:00:21,645] [INFO] [logging.py:107:log_dist] [Rank 0] step=2629, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2629 loss: 0.0642 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 03:00:32,318] [INFO] [logging.py:107:log_dist] [Rank 0] step=2630, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2630 loss: 0.0956 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:00:42,987] [INFO] [logging.py:107:log_dist] [Rank 0] step=2631, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2631 loss: 0.0346 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:00:53,661] [INFO] [logging.py:107:log_dist] [Rank 0] step=2632, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2632 loss: 0.0439 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:01:04,333] [INFO] [logging.py:107:log_dist] [Rank 0] step=2633, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2633 loss: 0.0321 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:01:15,003] [INFO] [logging.py:107:log_dist] [Rank 0] step=2634, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2634 loss: 0.0329 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:01:25,675] [INFO] [logging.py:107:log_dist] [Rank 0] step=2635, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2635 loss: 0.0556 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:01:36,345] [INFO] [logging.py:107:log_dist] [Rank 0] step=2636, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2636 loss: 0.1115 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:01:47,200] [INFO] [logging.py:107:log_dist] [Rank 0] step=2637, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2637 loss: 0.1911 iter time (s): 10.825 samples/sec: 0.092 +[2025-05-06 03:01:57,874] [INFO] [logging.py:107:log_dist] [Rank 0] step=2638, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2638 loss: 0.0366 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:02:08,547] [INFO] [logging.py:107:log_dist] [Rank 0] step=2639, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2639 loss: 0.0348 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:02:19,219] [INFO] [logging.py:107:log_dist] [Rank 0] step=2640, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2640 loss: 0.0714 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:02:29,895] [INFO] [logging.py:107:log_dist] [Rank 0] step=2641, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2641 loss: 0.1707 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:02:40,564] [INFO] [logging.py:107:log_dist] [Rank 0] step=2642, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2642 loss: 0.2055 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:02:51,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=2643, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2643 loss: 0.0430 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 03:03:01,920] [INFO] [logging.py:107:log_dist] [Rank 0] step=2644, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2644 loss: 0.0900 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:03:12,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=2645, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2645 loss: 0.0413 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:03:23,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=2646, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2646 loss: 0.0365 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-06 03:03:34,134] [INFO] [logging.py:107:log_dist] [Rank 0] step=2647, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2647 loss: 0.0502 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:03:44,806] [INFO] [logging.py:107:log_dist] [Rank 0] step=2648, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2648 loss: 0.0689 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:03:55,492] [INFO] [logging.py:107:log_dist] [Rank 0] step=2649, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2649 loss: 0.0288 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-06 03:04:06,168] [INFO] [logging.py:107:log_dist] [Rank 0] step=2650, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2650 loss: 0.1060 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:04:16,836] [INFO] [logging.py:107:log_dist] [Rank 0] step=2651, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2651 loss: 0.0562 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:04:27,510] [INFO] [logging.py:107:log_dist] [Rank 0] step=2652, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2652 loss: 0.0454 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:04:38,181] [INFO] [logging.py:107:log_dist] [Rank 0] step=2653, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2653 loss: 0.0359 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:04:49,026] [INFO] [logging.py:107:log_dist] [Rank 0] step=2654, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2654 loss: 0.1012 iter time (s): 10.815 samples/sec: 0.092 +[2025-05-06 03:04:59,701] [INFO] [logging.py:107:log_dist] [Rank 0] step=2655, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2655 loss: 0.1453 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:05:10,376] [INFO] [logging.py:107:log_dist] [Rank 0] step=2656, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2656 loss: 0.0551 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:05:21,047] [INFO] [logging.py:107:log_dist] [Rank 0] step=2657, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2657 loss: 0.0566 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:05:31,720] [INFO] [logging.py:107:log_dist] [Rank 0] step=2658, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2658 loss: 0.0361 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:05:42,394] [INFO] [logging.py:107:log_dist] [Rank 0] step=2659, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2659 loss: 0.0303 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:05:53,068] [INFO] [logging.py:107:log_dist] [Rank 0] step=2660, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2660 loss: 0.0432 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:06:03,739] [INFO] [logging.py:107:log_dist] [Rank 0] step=2661, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2661 loss: 0.0521 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:06:14,409] [INFO] [logging.py:107:log_dist] [Rank 0] step=2662, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2662 loss: 0.1050 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:06:25,244] [INFO] [logging.py:107:log_dist] [Rank 0] step=2663, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2663 loss: 0.0933 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 03:06:35,917] [INFO] [logging.py:107:log_dist] [Rank 0] step=2664, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2664 loss: 0.0321 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:06:46,584] [INFO] [logging.py:107:log_dist] [Rank 0] step=2665, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2665 loss: 0.0465 iter time (s): 10.640 samples/sec: 0.094 +Started new epoch: 66 +[2025-05-06 03:06:57,595] [INFO] [logging.py:107:log_dist] [Rank 0] step=2666, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2666 loss: 0.0703 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:07:08,270] [INFO] [logging.py:107:log_dist] [Rank 0] step=2667, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2667 loss: 0.0376 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:07:18,938] [INFO] [logging.py:107:log_dist] [Rank 0] step=2668, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2668 loss: 0.0660 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 03:07:29,611] [INFO] [logging.py:107:log_dist] [Rank 0] step=2669, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2669 loss: 0.0304 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:07:40,284] [INFO] [logging.py:107:log_dist] [Rank 0] step=2670, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2670 loss: 0.0911 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:07:50,954] [INFO] [logging.py:107:log_dist] [Rank 0] step=2671, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2671 loss: 0.0275 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:08:01,783] [INFO] [logging.py:107:log_dist] [Rank 0] step=2672, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2672 loss: 0.1736 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 03:08:12,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=2673, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2673 loss: 0.0380 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:08:23,133] [INFO] [logging.py:107:log_dist] [Rank 0] step=2674, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2674 loss: 0.0882 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:08:33,804] [INFO] [logging.py:107:log_dist] [Rank 0] step=2675, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2675 loss: 0.0566 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:08:44,477] [INFO] [logging.py:107:log_dist] [Rank 0] step=2676, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2676 loss: 0.0355 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:08:55,151] [INFO] [logging.py:107:log_dist] [Rank 0] step=2677, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2677 loss: 0.0584 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:09:05,829] [INFO] [logging.py:107:log_dist] [Rank 0] step=2678, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2678 loss: 0.0306 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 03:09:16,503] [INFO] [logging.py:107:log_dist] [Rank 0] step=2679, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2679 loss: 0.0344 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:09:27,361] [INFO] [logging.py:107:log_dist] [Rank 0] step=2680, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2680 loss: 0.1114 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 03:09:38,034] [INFO] [logging.py:107:log_dist] [Rank 0] step=2681, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2681 loss: 0.0279 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:09:48,712] [INFO] [logging.py:107:log_dist] [Rank 0] step=2682, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2682 loss: 0.2935 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 03:09:59,384] [INFO] [logging.py:107:log_dist] [Rank 0] step=2683, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2683 loss: 0.0630 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:10:10,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=2684, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2684 loss: 0.0492 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:10:20,737] [INFO] [logging.py:107:log_dist] [Rank 0] step=2685, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2685 loss: 0.0323 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 03:10:31,407] [INFO] [logging.py:107:log_dist] [Rank 0] step=2686, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2686 loss: 0.0298 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:10:42,077] [INFO] [logging.py:107:log_dist] [Rank 0] step=2687, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2687 loss: 0.0423 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:10:52,750] [INFO] [logging.py:107:log_dist] [Rank 0] step=2688, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2688 loss: 0.1564 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:11:03,594] [INFO] [logging.py:107:log_dist] [Rank 0] step=2689, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2689 loss: 0.0271 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 03:11:14,263] [INFO] [logging.py:107:log_dist] [Rank 0] step=2690, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2690 loss: 0.0330 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:11:24,939] [INFO] [logging.py:107:log_dist] [Rank 0] step=2691, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2691 loss: 0.0894 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:11:35,610] [INFO] [logging.py:107:log_dist] [Rank 0] step=2692, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2692 loss: 0.0251 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:11:46,283] [INFO] [logging.py:107:log_dist] [Rank 0] step=2693, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2693 loss: 0.1069 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:11:46,285] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step2693 is about to be saved! +[2025-05-06 03:11:46,286] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_00-model_states.pt... +[2025-05-06 03:11:46,287] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_00-model_states.pt. +[2025-05-06 03:11:46,293] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_01-model_states.pt... +[2025-05-06 03:11:46,300] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_01-model_states.pt. +[2025-05-06 03:11:46,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_02-model_states.pt... +[2025-05-06 03:11:46,312] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_02-model_states.pt. +[2025-05-06 03:11:46,316] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_03-model_states.pt... +[2025-05-06 03:11:46,323] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_03-model_states.pt. +[2025-05-06 03:11:46,326] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_04-model_states.pt... +[2025-05-06 03:11:46,333] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_04-model_states.pt. +[2025-05-06 03:11:46,337] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_05-model_states.pt... +[2025-05-06 03:11:46,343] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_05-model_states.pt. +[2025-05-06 03:11:46,347] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_06-model_states.pt... +[2025-05-06 03:11:46,353] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_06-model_states.pt. +[2025-05-06 03:11:46,356] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_07-model_states.pt... +[2025-05-06 03:11:46,363] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_07-model_states.pt. +[2025-05-06 03:11:46,366] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_08-model_states.pt... +[2025-05-06 03:11:46,373] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_08-model_states.pt. +[2025-05-06 03:11:46,376] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_09-model_states.pt... +[2025-05-06 03:11:46,383] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_09-model_states.pt. +[2025-05-06 03:11:46,386] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_10-model_states.pt... +[2025-05-06 03:11:46,393] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_10-model_states.pt. +[2025-05-06 03:11:46,396] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_11-model_states.pt... +[2025-05-06 03:11:46,403] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_11-model_states.pt. +[2025-05-06 03:11:46,406] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_12-model_states.pt... +[2025-05-06 03:11:46,413] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_12-model_states.pt. +[2025-05-06 03:11:46,416] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_13-model_states.pt... +[2025-05-06 03:11:46,423] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_13-model_states.pt. +[2025-05-06 03:11:46,426] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_14-model_states.pt... +[2025-05-06 03:11:46,433] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_14-model_states.pt. +[2025-05-06 03:11:46,436] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_15-model_states.pt... +[2025-05-06 03:11:46,443] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_15-model_states.pt. +[2025-05-06 03:11:46,446] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_16-model_states.pt... +[2025-05-06 03:11:46,452] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_16-model_states.pt. +[2025-05-06 03:11:46,456] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_17-model_states.pt... +[2025-05-06 03:11:46,462] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_17-model_states.pt. +[2025-05-06 03:11:46,465] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_18-model_states.pt... +[2025-05-06 03:11:46,472] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_18-model_states.pt. +[2025-05-06 03:11:46,475] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_19-model_states.pt... +[2025-05-06 03:11:46,482] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_19-model_states.pt. +[2025-05-06 03:11:46,485] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_20-model_states.pt... +[2025-05-06 03:11:46,492] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_20-model_states.pt. +[2025-05-06 03:11:46,495] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_21-model_states.pt... +[2025-05-06 03:11:46,501] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_21-model_states.pt. +[2025-05-06 03:11:46,505] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_22-model_states.pt... +[2025-05-06 03:11:46,511] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_22-model_states.pt. +[2025-05-06 03:11:46,514] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_23-model_states.pt... +[2025-05-06 03:11:46,521] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_23-model_states.pt. +[2025-05-06 03:11:46,524] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_24-model_states.pt... +[2025-05-06 03:11:46,531] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_24-model_states.pt. +[2025-05-06 03:11:46,534] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_25-model_states.pt... +[2025-05-06 03:11:46,541] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_25-model_states.pt. +[2025-05-06 03:11:46,544] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_26-model_states.pt... +[2025-05-06 03:11:46,550] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_26-model_states.pt. +[2025-05-06 03:11:46,554] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_27-model_states.pt... +[2025-05-06 03:11:46,560] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_27-model_states.pt. +[2025-05-06 03:11:46,564] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_28-model_states.pt... +[2025-05-06 03:11:46,570] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_28-model_states.pt. +[2025-05-06 03:11:46,573] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_29-model_states.pt... +[2025-05-06 03:11:46,580] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_29-model_states.pt. +[2025-05-06 03:11:46,583] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_30-model_states.pt... +[2025-05-06 03:11:46,590] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_30-model_states.pt. +[2025-05-06 03:11:46,593] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_31-model_states.pt... +[2025-05-06 03:11:46,600] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_31-model_states.pt. +[2025-05-06 03:11:46,603] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_32-model_states.pt... +[2025-05-06 03:11:46,610] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_32-model_states.pt. +[2025-05-06 03:11:46,613] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_33-model_states.pt... +[2025-05-06 03:11:46,620] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_33-model_states.pt. +[2025-05-06 03:11:46,623] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_34-model_states.pt... +[2025-05-06 03:11:46,629] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_34-model_states.pt. +[2025-05-06 03:11:46,633] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_35-model_states.pt... +[2025-05-06 03:11:46,639] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_35-model_states.pt. +[2025-05-06 03:11:46,642] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_36-model_states.pt... +[2025-05-06 03:11:46,649] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_36-model_states.pt. +[2025-05-06 03:11:46,652] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_37-model_states.pt... +[2025-05-06 03:11:46,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_37-model_states.pt. +[2025-05-06 03:11:46,662] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_38-model_states.pt... +[2025-05-06 03:11:46,669] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_38-model_states.pt. +[2025-05-06 03:11:46,672] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_39-model_states.pt... +[2025-05-06 03:11:46,679] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_39-model_states.pt. +[2025-05-06 03:11:46,682] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_40-model_states.pt... +[2025-05-06 03:11:46,688] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_40-model_states.pt. +[2025-05-06 03:11:46,689] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_41-model_states.pt... +[2025-05-06 03:11:46,689] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/layer_41-model_states.pt. +[2025-05-06 03:11:46,707] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/mp_rank_00_model_states.pt +[2025-05-06 03:11:46,707] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/mp_rank_00_model_states.pt... +[2025-05-06 03:11:47,717] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step2693/mp_rank_00_model_states.pt. +[2025-05-06 03:11:47,718] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2693 is ready now! +[2025-05-06 03:11:58,392] [INFO] [logging.py:107:log_dist] [Rank 0] step=2694, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2694 loss: 0.0675 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:12:09,062] [INFO] [logging.py:107:log_dist] [Rank 0] step=2695, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2695 loss: 0.0760 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:12:19,750] [INFO] [logging.py:107:log_dist] [Rank 0] step=2696, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2696 loss: 0.3364 iter time (s): 10.656 samples/sec: 0.094 +[2025-05-06 03:12:30,622] [INFO] [logging.py:107:log_dist] [Rank 0] step=2697, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2697 loss: 0.2429 iter time (s): 10.841 samples/sec: 0.092 +[2025-05-06 03:12:41,296] [INFO] [logging.py:107:log_dist] [Rank 0] step=2698, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2698 loss: 0.0902 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:12:51,979] [INFO] [logging.py:107:log_dist] [Rank 0] step=2699, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2699 loss: 0.0836 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 03:13:02,649] [INFO] [logging.py:107:log_dist] [Rank 0] step=2700, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2700 loss: 0.0351 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:13:13,321] [INFO] [logging.py:107:log_dist] [Rank 0] step=2701, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2701 loss: 0.0510 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:13:23,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=2702, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2702 loss: 0.0586 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:13:34,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=2703, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2703 loss: 0.1341 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:13:45,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=2704, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2704 loss: 0.0460 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:13:56,011] [INFO] [logging.py:107:log_dist] [Rank 0] step=2705, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2705 loss: 0.1656 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:14:06,838] [INFO] [logging.py:107:log_dist] [Rank 0] step=2706, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2706 loss: 0.0489 iter time (s): 10.800 samples/sec: 0.093 +Started new epoch: 67 +[2025-05-06 03:14:17,842] [INFO] [logging.py:107:log_dist] [Rank 0] step=2707, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2707 loss: 0.0274 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:14:28,516] [INFO] [logging.py:107:log_dist] [Rank 0] step=2708, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2708 loss: 0.0287 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:14:39,186] [INFO] [logging.py:107:log_dist] [Rank 0] step=2709, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2709 loss: 0.0722 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:14:49,851] [INFO] [logging.py:107:log_dist] [Rank 0] step=2710, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2710 loss: 0.0443 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 03:15:00,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=2711, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2711 loss: 0.0491 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:15:11,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=2712, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2712 loss: 0.1226 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:15:21,869] [INFO] [logging.py:107:log_dist] [Rank 0] step=2713, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2713 loss: 0.1077 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:15:32,745] [INFO] [logging.py:107:log_dist] [Rank 0] step=2714, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2714 loss: 0.0389 iter time (s): 10.837 samples/sec: 0.092 +[2025-05-06 03:15:43,422] [INFO] [logging.py:107:log_dist] [Rank 0] step=2715, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2715 loss: 0.1252 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 03:15:54,095] [INFO] [logging.py:107:log_dist] [Rank 0] step=2716, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2716 loss: 0.2650 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:16:04,765] [INFO] [logging.py:107:log_dist] [Rank 0] step=2717, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2717 loss: 0.0502 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:16:15,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=2718, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2718 loss: 0.0278 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:16:26,104] [INFO] [logging.py:107:log_dist] [Rank 0] step=2719, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2719 loss: 0.0434 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:16:36,780] [INFO] [logging.py:107:log_dist] [Rank 0] step=2720, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2720 loss: 0.0374 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:16:47,449] [INFO] [logging.py:107:log_dist] [Rank 0] step=2721, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2721 loss: 0.0412 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:16:58,120] [INFO] [logging.py:107:log_dist] [Rank 0] step=2722, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2722 loss: 0.1150 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:17:08,954] [INFO] [logging.py:107:log_dist] [Rank 0] step=2723, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2723 loss: 0.0474 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 03:17:19,627] [INFO] [logging.py:107:log_dist] [Rank 0] step=2724, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2724 loss: 0.0701 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:17:30,295] [INFO] [logging.py:107:log_dist] [Rank 0] step=2725, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2725 loss: 0.0777 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:17:40,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=2726, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2726 loss: 0.1353 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:17:51,642] [INFO] [logging.py:107:log_dist] [Rank 0] step=2727, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2727 loss: 0.0322 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:18:02,312] [INFO] [logging.py:107:log_dist] [Rank 0] step=2728, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2728 loss: 0.0413 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:18:12,984] [INFO] [logging.py:107:log_dist] [Rank 0] step=2729, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2729 loss: 0.0697 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:18:23,658] [INFO] [logging.py:107:log_dist] [Rank 0] step=2730, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2730 loss: 0.0362 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:18:34,327] [INFO] [logging.py:107:log_dist] [Rank 0] step=2731, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2731 loss: 0.0322 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:18:45,187] [INFO] [logging.py:107:log_dist] [Rank 0] step=2732, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2732 loss: 0.0291 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 03:18:55,856] [INFO] [logging.py:107:log_dist] [Rank 0] step=2733, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2733 loss: 0.3695 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:19:06,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=2734, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2734 loss: 0.0743 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:19:17,202] [INFO] [logging.py:107:log_dist] [Rank 0] step=2735, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2735 loss: 0.0299 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:19:27,873] [INFO] [logging.py:107:log_dist] [Rank 0] step=2736, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2736 loss: 0.0484 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:19:38,544] [INFO] [logging.py:107:log_dist] [Rank 0] step=2737, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2737 loss: 0.0695 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:19:49,223] [INFO] [logging.py:107:log_dist] [Rank 0] step=2738, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2738 loss: 0.0376 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:19:59,894] [INFO] [logging.py:107:log_dist] [Rank 0] step=2739, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2739 loss: 0.1858 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:20:10,731] [INFO] [logging.py:107:log_dist] [Rank 0] step=2740, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2740 loss: 0.0405 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-06 03:20:21,405] [INFO] [logging.py:107:log_dist] [Rank 0] step=2741, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2741 loss: 0.0631 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:20:32,074] [INFO] [logging.py:107:log_dist] [Rank 0] step=2742, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2742 loss: 0.0867 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:20:42,745] [INFO] [logging.py:107:log_dist] [Rank 0] step=2743, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2743 loss: 0.0745 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:20:53,421] [INFO] [logging.py:107:log_dist] [Rank 0] step=2744, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2744 loss: 0.1194 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 03:21:04,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=2745, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2745 loss: 0.0487 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:21:14,759] [INFO] [logging.py:107:log_dist] [Rank 0] step=2746, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2746 loss: 0.0752 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:21:25,432] [INFO] [logging.py:107:log_dist] [Rank 0] step=2747, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2747 loss: 0.0476 iter time (s): 10.644 samples/sec: 0.094 +Started new epoch: 68 +[2025-05-06 03:21:36,444] [INFO] [logging.py:107:log_dist] [Rank 0] step=2748, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2748 loss: 0.0386 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:21:47,302] [INFO] [logging.py:107:log_dist] [Rank 0] step=2749, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2749 loss: 0.0459 iter time (s): 10.815 samples/sec: 0.092 +[2025-05-06 03:21:57,974] [INFO] [logging.py:107:log_dist] [Rank 0] step=2750, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2750 loss: 0.0415 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:22:08,646] [INFO] [logging.py:107:log_dist] [Rank 0] step=2751, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2751 loss: 0.0668 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:22:19,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=2752, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2752 loss: 0.0384 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:22:29,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=2753, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2753 loss: 0.0288 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:22:40,664] [INFO] [logging.py:107:log_dist] [Rank 0] step=2754, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2754 loss: 0.0331 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:22:51,344] [INFO] [logging.py:107:log_dist] [Rank 0] step=2755, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2755 loss: 0.0859 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:23:02,030] [INFO] [logging.py:107:log_dist] [Rank 0] step=2756, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2756 loss: 0.0599 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:23:12,912] [INFO] [logging.py:107:log_dist] [Rank 0] step=2757, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2757 loss: 0.0315 iter time (s): 10.850 samples/sec: 0.092 +[2025-05-06 03:23:23,588] [INFO] [logging.py:107:log_dist] [Rank 0] step=2758, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2758 loss: 0.0490 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:23:34,261] [INFO] [logging.py:107:log_dist] [Rank 0] step=2759, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2759 loss: 0.1383 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:23:44,929] [INFO] [logging.py:107:log_dist] [Rank 0] step=2760, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2760 loss: 0.0310 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:23:55,600] [INFO] [logging.py:107:log_dist] [Rank 0] step=2761, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2761 loss: 0.0617 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:24:06,276] [INFO] [logging.py:107:log_dist] [Rank 0] step=2762, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2762 loss: 0.0374 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:24:16,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=2763, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2763 loss: 0.1303 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:24:27,636] [INFO] [logging.py:107:log_dist] [Rank 0] step=2764, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2764 loss: 0.0396 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:24:38,307] [INFO] [logging.py:107:log_dist] [Rank 0] step=2765, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2765 loss: 0.1099 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:24:49,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=2766, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2766 loss: 0.0591 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 03:24:59,815] [INFO] [logging.py:107:log_dist] [Rank 0] step=2767, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2767 loss: 0.0480 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 03:25:10,491] [INFO] [logging.py:107:log_dist] [Rank 0] step=2768, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2768 loss: 0.0448 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:25:21,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=2769, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2769 loss: 0.0342 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:25:31,832] [INFO] [logging.py:107:log_dist] [Rank 0] step=2770, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2770 loss: 0.0652 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:25:42,501] [INFO] [logging.py:107:log_dist] [Rank 0] step=2771, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2771 loss: 0.0292 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:25:53,171] [INFO] [logging.py:107:log_dist] [Rank 0] step=2772, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2772 loss: 0.0499 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:26:03,847] [INFO] [logging.py:107:log_dist] [Rank 0] step=2773, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2773 loss: 0.0505 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:26:14,518] [INFO] [logging.py:107:log_dist] [Rank 0] step=2774, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2774 loss: 0.0600 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:26:25,355] [INFO] [logging.py:107:log_dist] [Rank 0] step=2775, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2775 loss: 0.0737 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-06 03:26:36,027] [INFO] [logging.py:107:log_dist] [Rank 0] step=2776, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2776 loss: 0.0464 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:26:46,697] [INFO] [logging.py:107:log_dist] [Rank 0] step=2777, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2777 loss: 0.0289 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:26:57,371] [INFO] [logging.py:107:log_dist] [Rank 0] step=2778, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2778 loss: 0.3940 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 03:27:08,049] [INFO] [logging.py:107:log_dist] [Rank 0] step=2779, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2779 loss: 0.3001 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 03:27:18,721] [INFO] [logging.py:107:log_dist] [Rank 0] step=2780, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2780 loss: 0.0431 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:27:29,393] [INFO] [logging.py:107:log_dist] [Rank 0] step=2781, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2781 loss: 0.0338 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:27:40,065] [INFO] [logging.py:107:log_dist] [Rank 0] step=2782, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2782 loss: 0.0653 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:27:50,733] [INFO] [logging.py:107:log_dist] [Rank 0] step=2783, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2783 loss: 0.0369 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:28:01,563] [INFO] [logging.py:107:log_dist] [Rank 0] step=2784, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2784 loss: 0.0412 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 03:28:12,235] [INFO] [logging.py:107:log_dist] [Rank 0] step=2785, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2785 loss: 0.1235 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:28:22,902] [INFO] [logging.py:107:log_dist] [Rank 0] step=2786, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2786 loss: 0.1112 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:28:33,576] [INFO] [logging.py:107:log_dist] [Rank 0] step=2787, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2787 loss: 0.1922 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:28:44,242] [INFO] [logging.py:107:log_dist] [Rank 0] step=2788, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2788 loss: 0.0430 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 69 +[2025-05-06 03:28:55,254] [INFO] [logging.py:107:log_dist] [Rank 0] step=2789, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2789 loss: 0.1102 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:29:05,923] [INFO] [logging.py:107:log_dist] [Rank 0] step=2790, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2790 loss: 0.1253 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:29:16,600] [INFO] [logging.py:107:log_dist] [Rank 0] step=2791, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2791 loss: 0.0337 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 03:29:27,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=2792, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2792 loss: 0.0298 iter time (s): 10.826 samples/sec: 0.092 +[2025-05-06 03:29:38,129] [INFO] [logging.py:107:log_dist] [Rank 0] step=2793, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2793 loss: 0.0620 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:29:48,800] [INFO] [logging.py:107:log_dist] [Rank 0] step=2794, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2794 loss: 0.0652 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:29:59,470] [INFO] [logging.py:107:log_dist] [Rank 0] step=2795, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2795 loss: 0.0499 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:30:10,149] [INFO] [logging.py:107:log_dist] [Rank 0] step=2796, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2796 loss: 0.0737 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 03:30:20,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=2797, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2797 loss: 0.0312 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:30:31,501] [INFO] [logging.py:107:log_dist] [Rank 0] step=2798, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2798 loss: 0.2969 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:30:42,183] [INFO] [logging.py:107:log_dist] [Rank 0] step=2799, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2799 loss: 0.2094 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 03:30:52,863] [INFO] [logging.py:107:log_dist] [Rank 0] step=2800, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2800 loss: 0.0356 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:31:03,722] [INFO] [logging.py:107:log_dist] [Rank 0] step=2801, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2801 loss: 0.0546 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 03:31:14,395] [INFO] [logging.py:107:log_dist] [Rank 0] step=2802, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2802 loss: 0.0366 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:31:25,063] [INFO] [logging.py:107:log_dist] [Rank 0] step=2803, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2803 loss: 0.3145 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 03:31:35,730] [INFO] [logging.py:107:log_dist] [Rank 0] step=2804, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2804 loss: 0.0408 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:31:46,403] [INFO] [logging.py:107:log_dist] [Rank 0] step=2805, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2805 loss: 0.0704 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:31:57,078] [INFO] [logging.py:107:log_dist] [Rank 0] step=2806, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2806 loss: 0.0413 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:32:07,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=2807, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2807 loss: 0.0480 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:32:18,419] [INFO] [logging.py:107:log_dist] [Rank 0] step=2808, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2808 loss: 0.0351 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:32:29,091] [INFO] [logging.py:107:log_dist] [Rank 0] step=2809, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2809 loss: 0.0784 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:32:39,920] [INFO] [logging.py:107:log_dist] [Rank 0] step=2810, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2810 loss: 0.0305 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 03:32:50,597] [INFO] [logging.py:107:log_dist] [Rank 0] step=2811, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2811 loss: 0.0370 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:33:01,270] [INFO] [logging.py:107:log_dist] [Rank 0] step=2812, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2812 loss: 0.0482 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:33:11,946] [INFO] [logging.py:107:log_dist] [Rank 0] step=2813, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2813 loss: 0.1707 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 03:33:22,632] [INFO] [logging.py:107:log_dist] [Rank 0] step=2814, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2814 loss: 0.0282 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 03:33:33,309] [INFO] [logging.py:107:log_dist] [Rank 0] step=2815, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2815 loss: 0.0733 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 03:33:43,984] [INFO] [logging.py:107:log_dist] [Rank 0] step=2816, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2816 loss: 0.0296 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:33:54,659] [INFO] [logging.py:107:log_dist] [Rank 0] step=2817, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2817 loss: 0.0357 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:34:05,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=2818, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2818 loss: 0.1221 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 03:34:16,190] [INFO] [logging.py:107:log_dist] [Rank 0] step=2819, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2819 loss: 0.1316 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:34:26,870] [INFO] [logging.py:107:log_dist] [Rank 0] step=2820, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2820 loss: 0.0325 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 03:34:37,582] [INFO] [logging.py:107:log_dist] [Rank 0] step=2821, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2821 loss: 0.0310 iter time (s): 10.679 samples/sec: 0.094 +[2025-05-06 03:34:48,255] [INFO] [logging.py:107:log_dist] [Rank 0] step=2822, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2822 loss: 0.0626 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:34:58,930] [INFO] [logging.py:107:log_dist] [Rank 0] step=2823, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2823 loss: 0.0637 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:35:09,605] [INFO] [logging.py:107:log_dist] [Rank 0] step=2824, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2824 loss: 0.1481 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:35:20,273] [INFO] [logging.py:107:log_dist] [Rank 0] step=2825, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2825 loss: 0.0424 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:35:30,947] [INFO] [logging.py:107:log_dist] [Rank 0] step=2826, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2826 loss: 0.0817 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:35:41,807] [INFO] [logging.py:107:log_dist] [Rank 0] step=2827, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2827 loss: 0.0457 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 03:35:52,478] [INFO] [logging.py:107:log_dist] [Rank 0] step=2828, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2828 loss: 0.3204 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:36:03,144] [INFO] [logging.py:107:log_dist] [Rank 0] step=2829, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2829 loss: 0.1207 iter time (s): 10.640 samples/sec: 0.094 +Started new epoch: 70 +[2025-05-06 03:36:14,159] [INFO] [logging.py:107:log_dist] [Rank 0] step=2830, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2830 loss: 0.1764 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:36:24,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=2831, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2831 loss: 0.0740 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:36:35,503] [INFO] [logging.py:107:log_dist] [Rank 0] step=2832, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2832 loss: 0.1150 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:36:46,174] [INFO] [logging.py:107:log_dist] [Rank 0] step=2833, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2833 loss: 0.1386 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:36:56,841] [INFO] [logging.py:107:log_dist] [Rank 0] step=2834, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2834 loss: 0.0380 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 03:37:07,516] [INFO] [logging.py:107:log_dist] [Rank 0] step=2835, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2835 loss: 0.0413 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:37:18,341] [INFO] [logging.py:107:log_dist] [Rank 0] step=2836, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2836 loss: 0.0321 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-06 03:37:29,013] [INFO] [logging.py:107:log_dist] [Rank 0] step=2837, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2837 loss: 0.0680 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:37:39,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=2838, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2838 loss: 0.1142 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:37:50,359] [INFO] [logging.py:107:log_dist] [Rank 0] step=2839, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2839 loss: 0.0296 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:38:01,027] [INFO] [logging.py:107:log_dist] [Rank 0] step=2840, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2840 loss: 0.0696 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:38:11,701] [INFO] [logging.py:107:log_dist] [Rank 0] step=2841, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2841 loss: 0.1134 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:38:22,372] [INFO] [logging.py:107:log_dist] [Rank 0] step=2842, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2842 loss: 0.0543 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:38:33,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=2843, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2843 loss: 0.0515 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:38:43,883] [INFO] [logging.py:107:log_dist] [Rank 0] step=2844, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2844 loss: 0.0396 iter time (s): 10.811 samples/sec: 0.093 +[2025-05-06 03:38:54,549] [INFO] [logging.py:107:log_dist] [Rank 0] step=2845, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2845 loss: 0.0482 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:39:05,223] [INFO] [logging.py:107:log_dist] [Rank 0] step=2846, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2846 loss: 0.2826 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:39:15,906] [INFO] [logging.py:107:log_dist] [Rank 0] step=2847, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2847 loss: 0.0348 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 03:39:26,574] [INFO] [logging.py:107:log_dist] [Rank 0] step=2848, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2848 loss: 0.0326 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 03:39:37,244] [INFO] [logging.py:107:log_dist] [Rank 0] step=2849, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2849 loss: 0.0277 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:39:47,922] [INFO] [logging.py:107:log_dist] [Rank 0] step=2850, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2850 loss: 0.1069 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 03:39:58,590] [INFO] [logging.py:107:log_dist] [Rank 0] step=2851, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2851 loss: 0.0331 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:40:09,264] [INFO] [logging.py:107:log_dist] [Rank 0] step=2852, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2852 loss: 0.1073 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:40:20,099] [INFO] [logging.py:107:log_dist] [Rank 0] step=2853, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2853 loss: 0.0434 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 03:40:30,768] [INFO] [logging.py:107:log_dist] [Rank 0] step=2854, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2854 loss: 0.0586 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:40:41,447] [INFO] [logging.py:107:log_dist] [Rank 0] step=2855, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2855 loss: 0.0545 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 03:40:52,118] [INFO] [logging.py:107:log_dist] [Rank 0] step=2856, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2856 loss: 0.0285 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:41:02,787] [INFO] [logging.py:107:log_dist] [Rank 0] step=2857, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2857 loss: 0.0364 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:41:13,463] [INFO] [logging.py:107:log_dist] [Rank 0] step=2858, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2858 loss: 0.0536 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:41:24,134] [INFO] [logging.py:107:log_dist] [Rank 0] step=2859, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2859 loss: 0.0653 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:41:34,811] [INFO] [logging.py:107:log_dist] [Rank 0] step=2860, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2860 loss: 0.2568 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:41:45,686] [INFO] [logging.py:107:log_dist] [Rank 0] step=2861, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2861 loss: 0.0538 iter time (s): 10.844 samples/sec: 0.092 +[2025-05-06 03:41:56,359] [INFO] [logging.py:107:log_dist] [Rank 0] step=2862, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2862 loss: 0.0511 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:42:07,030] [INFO] [logging.py:107:log_dist] [Rank 0] step=2863, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2863 loss: 0.2352 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:42:17,705] [INFO] [logging.py:107:log_dist] [Rank 0] step=2864, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2864 loss: 0.0346 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:42:28,375] [INFO] [logging.py:107:log_dist] [Rank 0] step=2865, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2865 loss: 0.1713 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:42:39,056] [INFO] [logging.py:107:log_dist] [Rank 0] step=2866, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2866 loss: 0.0718 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 03:42:49,730] [INFO] [logging.py:107:log_dist] [Rank 0] step=2867, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2867 loss: 0.1202 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:43:00,400] [INFO] [logging.py:107:log_dist] [Rank 0] step=2868, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2868 loss: 0.5268 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:43:11,067] [INFO] [logging.py:107:log_dist] [Rank 0] step=2869, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2869 loss: 0.0446 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:43:21,922] [INFO] [logging.py:107:log_dist] [Rank 0] step=2870, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2870 loss: 0.0404 iter time (s): 10.828 samples/sec: 0.092 +Saving model to directory epoch70 +Started new epoch: 71 +[2025-05-06 03:43:34,480] [INFO] [logging.py:107:log_dist] [Rank 0] step=2871, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2871 loss: 0.0608 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:43:45,150] [INFO] [logging.py:107:log_dist] [Rank 0] step=2872, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2872 loss: 0.1061 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:43:55,821] [INFO] [logging.py:107:log_dist] [Rank 0] step=2873, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2873 loss: 0.1335 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:44:06,494] [INFO] [logging.py:107:log_dist] [Rank 0] step=2874, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2874 loss: 0.0666 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:44:17,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=2875, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2875 loss: 0.0444 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:44:27,835] [INFO] [logging.py:107:log_dist] [Rank 0] step=2876, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2876 loss: 0.0446 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:44:38,505] [INFO] [logging.py:107:log_dist] [Rank 0] step=2877, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2877 loss: 0.0765 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:44:49,333] [INFO] [logging.py:107:log_dist] [Rank 0] step=2878, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2878 loss: 0.0336 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-06 03:45:00,009] [INFO] [logging.py:107:log_dist] [Rank 0] step=2879, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2879 loss: 0.3841 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:45:10,693] [INFO] [logging.py:107:log_dist] [Rank 0] step=2880, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2880 loss: 0.0295 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:45:21,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=2881, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2881 loss: 0.0785 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:45:32,037] [INFO] [logging.py:107:log_dist] [Rank 0] step=2882, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2882 loss: 0.0551 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:45:42,709] [INFO] [logging.py:107:log_dist] [Rank 0] step=2883, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2883 loss: 0.1172 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:45:53,379] [INFO] [logging.py:107:log_dist] [Rank 0] step=2884, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2884 loss: 0.0366 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:46:04,054] [INFO] [logging.py:107:log_dist] [Rank 0] step=2885, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2885 loss: 0.0616 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:46:14,725] [INFO] [logging.py:107:log_dist] [Rank 0] step=2886, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2886 loss: 0.0392 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:46:25,565] [INFO] [logging.py:107:log_dist] [Rank 0] step=2887, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2887 loss: 0.0426 iter time (s): 10.809 samples/sec: 0.093 +[2025-05-06 03:46:36,240] [INFO] [logging.py:107:log_dist] [Rank 0] step=2888, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2888 loss: 0.0692 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:46:46,928] [INFO] [logging.py:107:log_dist] [Rank 0] step=2889, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2889 loss: 0.1085 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:46:57,597] [INFO] [logging.py:107:log_dist] [Rank 0] step=2890, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2890 loss: 0.0551 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:47:08,271] [INFO] [logging.py:107:log_dist] [Rank 0] step=2891, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2891 loss: 0.0289 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:47:18,942] [INFO] [logging.py:107:log_dist] [Rank 0] step=2892, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2892 loss: 0.0686 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:47:29,611] [INFO] [logging.py:107:log_dist] [Rank 0] step=2893, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2893 loss: 0.1760 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:47:40,284] [INFO] [logging.py:107:log_dist] [Rank 0] step=2894, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2894 loss: 0.0840 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:47:50,955] [INFO] [logging.py:107:log_dist] [Rank 0] step=2895, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2895 loss: 0.1434 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:48:01,791] [INFO] [logging.py:107:log_dist] [Rank 0] step=2896, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2896 loss: 0.0674 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 03:48:12,463] [INFO] [logging.py:107:log_dist] [Rank 0] step=2897, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2897 loss: 0.0288 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:48:23,132] [INFO] [logging.py:107:log_dist] [Rank 0] step=2898, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2898 loss: 0.0630 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:48:33,800] [INFO] [logging.py:107:log_dist] [Rank 0] step=2899, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2899 loss: 0.0305 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:48:44,474] [INFO] [logging.py:107:log_dist] [Rank 0] step=2900, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2900 loss: 0.0359 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:48:55,146] [INFO] [logging.py:107:log_dist] [Rank 0] step=2901, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2901 loss: 0.0591 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:49:05,826] [INFO] [logging.py:107:log_dist] [Rank 0] step=2902, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2902 loss: 0.0474 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 03:49:16,499] [INFO] [logging.py:107:log_dist] [Rank 0] step=2903, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2903 loss: 0.0312 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:49:27,358] [INFO] [logging.py:107:log_dist] [Rank 0] step=2904, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2904 loss: 0.0990 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 03:49:38,034] [INFO] [logging.py:107:log_dist] [Rank 0] step=2905, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2905 loss: 0.0688 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:49:48,705] [INFO] [logging.py:107:log_dist] [Rank 0] step=2906, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2906 loss: 0.1678 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:49:59,379] [INFO] [logging.py:107:log_dist] [Rank 0] step=2907, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2907 loss: 0.0393 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:50:10,074] [INFO] [logging.py:107:log_dist] [Rank 0] step=2908, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2908 loss: 0.0301 iter time (s): 10.665 samples/sec: 0.094 +[2025-05-06 03:50:20,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=2909, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2909 loss: 0.0665 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:50:31,416] [INFO] [logging.py:107:log_dist] [Rank 0] step=2910, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2910 loss: 0.0388 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:50:42,087] [INFO] [logging.py:107:log_dist] [Rank 0] step=2911, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2911 loss: 0.1015 iter time (s): 10.644 samples/sec: 0.094 +Started new epoch: 72 +[2025-05-06 03:50:53,099] [INFO] [logging.py:107:log_dist] [Rank 0] step=2912, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2912 loss: 0.0758 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:51:03,957] [INFO] [logging.py:107:log_dist] [Rank 0] step=2913, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2913 loss: 0.0442 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 03:51:14,634] [INFO] [logging.py:107:log_dist] [Rank 0] step=2914, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2914 loss: 0.0457 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:51:25,306] [INFO] [logging.py:107:log_dist] [Rank 0] step=2915, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2915 loss: 0.0551 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:51:35,976] [INFO] [logging.py:107:log_dist] [Rank 0] step=2916, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2916 loss: 0.2290 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:51:46,650] [INFO] [logging.py:107:log_dist] [Rank 0] step=2917, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2917 loss: 0.0438 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:51:57,318] [INFO] [logging.py:107:log_dist] [Rank 0] step=2918, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2918 loss: 0.0319 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:52:07,988] [INFO] [logging.py:107:log_dist] [Rank 0] step=2919, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2919 loss: 0.0652 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:52:18,661] [INFO] [logging.py:107:log_dist] [Rank 0] step=2920, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2920 loss: 0.0362 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:52:29,331] [INFO] [logging.py:107:log_dist] [Rank 0] step=2921, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2921 loss: 0.0705 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:52:40,160] [INFO] [logging.py:107:log_dist] [Rank 0] step=2922, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2922 loss: 0.0312 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 03:52:50,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=2923, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2923 loss: 0.1648 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:53:01,506] [INFO] [logging.py:107:log_dist] [Rank 0] step=2924, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2924 loss: 0.1631 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 03:53:12,175] [INFO] [logging.py:107:log_dist] [Rank 0] step=2925, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2925 loss: 0.1795 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:53:22,849] [INFO] [logging.py:107:log_dist] [Rank 0] step=2926, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2926 loss: 0.1582 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:53:33,532] [INFO] [logging.py:107:log_dist] [Rank 0] step=2927, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2927 loss: 0.0833 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:53:44,204] [INFO] [logging.py:107:log_dist] [Rank 0] step=2928, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2928 loss: 0.0301 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:53:54,876] [INFO] [logging.py:107:log_dist] [Rank 0] step=2929, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2929 loss: 0.0429 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:54:05,737] [INFO] [logging.py:107:log_dist] [Rank 0] step=2930, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2930 loss: 0.0316 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 03:54:16,408] [INFO] [logging.py:107:log_dist] [Rank 0] step=2931, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2931 loss: 0.1938 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:54:27,083] [INFO] [logging.py:107:log_dist] [Rank 0] step=2932, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2932 loss: 0.0715 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:54:37,754] [INFO] [logging.py:107:log_dist] [Rank 0] step=2933, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2933 loss: 0.0310 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:54:48,426] [INFO] [logging.py:107:log_dist] [Rank 0] step=2934, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2934 loss: 0.0503 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:54:59,099] [INFO] [logging.py:107:log_dist] [Rank 0] step=2935, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2935 loss: 0.0296 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:55:09,772] [INFO] [logging.py:107:log_dist] [Rank 0] step=2936, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2936 loss: 0.0278 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:55:20,441] [INFO] [logging.py:107:log_dist] [Rank 0] step=2937, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2937 loss: 0.2498 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:55:31,115] [INFO] [logging.py:107:log_dist] [Rank 0] step=2938, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2938 loss: 0.0317 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:55:41,975] [INFO] [logging.py:107:log_dist] [Rank 0] step=2939, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2939 loss: 0.1749 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 03:55:52,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=2940, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2940 loss: 0.0442 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 03:56:03,337] [INFO] [logging.py:107:log_dist] [Rank 0] step=2941, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2941 loss: 0.0770 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 03:56:14,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=2942, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2942 loss: 0.0386 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:56:24,673] [INFO] [logging.py:107:log_dist] [Rank 0] step=2943, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2943 loss: 0.0845 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 03:56:35,352] [INFO] [logging.py:107:log_dist] [Rank 0] step=2944, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2944 loss: 0.1915 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 03:56:46,023] [INFO] [logging.py:107:log_dist] [Rank 0] step=2945, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2945 loss: 0.1068 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:56:56,692] [INFO] [logging.py:107:log_dist] [Rank 0] step=2946, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2946 loss: 0.0541 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:57:07,377] [INFO] [logging.py:107:log_dist] [Rank 0] step=2947, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2947 loss: 0.0367 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-06 03:57:18,207] [INFO] [logging.py:107:log_dist] [Rank 0] step=2948, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2948 loss: 0.0427 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 03:57:28,877] [INFO] [logging.py:107:log_dist] [Rank 0] step=2949, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2949 loss: 0.0386 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:57:39,548] [INFO] [logging.py:107:log_dist] [Rank 0] step=2950, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2950 loss: 0.1470 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 03:57:50,217] [INFO] [logging.py:107:log_dist] [Rank 0] step=2951, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2951 loss: 0.1073 iter time (s): 10.673 samples/sec: 0.094 +[2025-05-06 03:58:00,921] [INFO] [logging.py:107:log_dist] [Rank 0] step=2952, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2952 loss: 0.0584 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 73 +[2025-05-06 03:58:11,937] [INFO] [logging.py:107:log_dist] [Rank 0] step=2953, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2953 loss: 0.1629 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 03:58:22,608] [INFO] [logging.py:107:log_dist] [Rank 0] step=2954, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2954 loss: 0.1301 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 03:58:33,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=2955, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2955 loss: 0.1382 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 03:58:44,123] [INFO] [logging.py:107:log_dist] [Rank 0] step=2956, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2956 loss: 0.0548 iter time (s): 10.814 samples/sec: 0.092 +[2025-05-06 03:58:54,799] [INFO] [logging.py:107:log_dist] [Rank 0] step=2957, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2957 loss: 0.1198 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 03:59:05,472] [INFO] [logging.py:107:log_dist] [Rank 0] step=2958, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2958 loss: 0.1400 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 03:59:16,143] [INFO] [logging.py:107:log_dist] [Rank 0] step=2959, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2959 loss: 0.0405 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 03:59:26,810] [INFO] [logging.py:107:log_dist] [Rank 0] step=2960, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2960 loss: 0.0365 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:59:37,484] [INFO] [logging.py:107:log_dist] [Rank 0] step=2961, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2961 loss: 0.1099 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 03:59:48,152] [INFO] [logging.py:107:log_dist] [Rank 0] step=2962, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2962 loss: 0.0317 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 03:59:58,820] [INFO] [logging.py:107:log_dist] [Rank 0] step=2963, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2963 loss: 0.2290 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:00:09,501] [INFO] [logging.py:107:log_dist] [Rank 0] step=2964, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2964 loss: 0.0626 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 04:00:20,336] [INFO] [logging.py:107:log_dist] [Rank 0] step=2965, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2965 loss: 0.0301 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 04:00:31,005] [INFO] [logging.py:107:log_dist] [Rank 0] step=2966, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2966 loss: 0.1115 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:00:41,674] [INFO] [logging.py:107:log_dist] [Rank 0] step=2967, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2967 loss: 0.2052 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:00:52,347] [INFO] [logging.py:107:log_dist] [Rank 0] step=2968, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2968 loss: 0.0304 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:01:03,015] [INFO] [logging.py:107:log_dist] [Rank 0] step=2969, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2969 loss: 0.0409 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:01:13,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=2970, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2970 loss: 0.0470 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:01:24,357] [INFO] [logging.py:107:log_dist] [Rank 0] step=2971, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2971 loss: 0.0873 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:01:35,027] [INFO] [logging.py:107:log_dist] [Rank 0] step=2972, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2972 loss: 0.0282 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:01:45,891] [INFO] [logging.py:107:log_dist] [Rank 0] step=2973, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2973 loss: 0.1468 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 04:01:56,563] [INFO] [logging.py:107:log_dist] [Rank 0] step=2974, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2974 loss: 0.0527 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:02:07,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=2975, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2975 loss: 0.0752 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:02:17,913] [INFO] [logging.py:107:log_dist] [Rank 0] step=2976, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2976 loss: 0.0338 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:02:28,583] [INFO] [logging.py:107:log_dist] [Rank 0] step=2977, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2977 loss: 0.1228 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:02:39,256] [INFO] [logging.py:107:log_dist] [Rank 0] step=2978, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2978 loss: 0.0265 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:02:49,934] [INFO] [logging.py:107:log_dist] [Rank 0] step=2979, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2979 loss: 0.0386 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 04:03:00,603] [INFO] [logging.py:107:log_dist] [Rank 0] step=2980, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2980 loss: 0.0262 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:03:11,271] [INFO] [logging.py:107:log_dist] [Rank 0] step=2981, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2981 loss: 0.1263 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:03:22,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=2982, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2982 loss: 0.0453 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 04:03:32,801] [INFO] [logging.py:107:log_dist] [Rank 0] step=2983, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2983 loss: 0.0629 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:03:43,473] [INFO] [logging.py:107:log_dist] [Rank 0] step=2984, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2984 loss: 0.0657 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:03:54,143] [INFO] [logging.py:107:log_dist] [Rank 0] step=2985, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2985 loss: 0.1065 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:04:04,813] [INFO] [logging.py:107:log_dist] [Rank 0] step=2986, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2986 loss: 0.0427 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:04:15,484] [INFO] [logging.py:107:log_dist] [Rank 0] step=2987, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2987 loss: 0.0299 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:04:26,160] [INFO] [logging.py:107:log_dist] [Rank 0] step=2988, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2988 loss: 0.0995 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:04:36,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=2989, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2989 loss: 0.1254 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:04:47,499] [INFO] [logging.py:107:log_dist] [Rank 0] step=2990, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2990 loss: 0.1138 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:04:58,332] [INFO] [logging.py:107:log_dist] [Rank 0] step=2991, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2991 loss: 0.0410 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 04:05:09,004] [INFO] [logging.py:107:log_dist] [Rank 0] step=2992, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2992 loss: 0.0696 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:05:19,666] [INFO] [logging.py:107:log_dist] [Rank 0] step=2993, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2993 loss: 0.0486 iter time (s): 10.635 samples/sec: 0.094 +Started new epoch: 74 +[2025-05-06 04:05:30,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=2994, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2994 loss: 0.0553 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:05:41,345] [INFO] [logging.py:107:log_dist] [Rank 0] step=2995, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2995 loss: 0.1237 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:05:52,020] [INFO] [logging.py:107:log_dist] [Rank 0] step=2996, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2996 loss: 0.0356 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:06:02,694] [INFO] [logging.py:107:log_dist] [Rank 0] step=2997, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2997 loss: 0.1576 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:06:13,362] [INFO] [logging.py:107:log_dist] [Rank 0] step=2998, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2998 loss: 0.1466 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:06:24,221] [INFO] [logging.py:107:log_dist] [Rank 0] step=2999, skipped=0, lr=[1e-05], mom=[0.0] +steps: 2999 loss: 0.1516 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 04:06:34,892] [INFO] [logging.py:107:log_dist] [Rank 0] step=3000, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3000 loss: 0.0549 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:06:45,563] [INFO] [logging.py:107:log_dist] [Rank 0] step=3001, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3001 loss: 0.0801 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:06:56,233] [INFO] [logging.py:107:log_dist] [Rank 0] step=3002, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3002 loss: 0.1881 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:07:06,910] [INFO] [logging.py:107:log_dist] [Rank 0] step=3003, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3003 loss: 0.0332 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:07:17,579] [INFO] [logging.py:107:log_dist] [Rank 0] step=3004, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3004 loss: 0.0351 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:07:28,252] [INFO] [logging.py:107:log_dist] [Rank 0] step=3005, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3005 loss: 0.0636 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:07:38,925] [INFO] [logging.py:107:log_dist] [Rank 0] step=3006, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3006 loss: 0.0606 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:07:49,595] [INFO] [logging.py:107:log_dist] [Rank 0] step=3007, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3007 loss: 0.1226 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:08:00,464] [INFO] [logging.py:107:log_dist] [Rank 0] step=3008, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3008 loss: 0.0232 iter time (s): 10.838 samples/sec: 0.092 +[2025-05-06 04:08:11,139] [INFO] [logging.py:107:log_dist] [Rank 0] step=3009, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3009 loss: 0.0870 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:08:21,811] [INFO] [logging.py:107:log_dist] [Rank 0] step=3010, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3010 loss: 0.0495 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:08:32,483] [INFO] [logging.py:107:log_dist] [Rank 0] step=3011, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3011 loss: 0.0386 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:08:43,153] [INFO] [logging.py:107:log_dist] [Rank 0] step=3012, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3012 loss: 0.0708 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:08:53,823] [INFO] [logging.py:107:log_dist] [Rank 0] step=3013, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3013 loss: 0.0650 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:09:04,504] [INFO] [logging.py:107:log_dist] [Rank 0] step=3014, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3014 loss: 0.0484 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 04:09:15,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=3015, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3015 loss: 0.0694 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:09:26,054] [INFO] [logging.py:107:log_dist] [Rank 0] step=3016, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3016 loss: 0.1175 iter time (s): 10.845 samples/sec: 0.092 +[2025-05-06 04:09:36,724] [INFO] [logging.py:107:log_dist] [Rank 0] step=3017, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3017 loss: 0.0394 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:09:47,407] [INFO] [logging.py:107:log_dist] [Rank 0] step=3018, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3018 loss: 0.0472 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:09:58,082] [INFO] [logging.py:107:log_dist] [Rank 0] step=3019, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3019 loss: 0.0341 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:10:08,757] [INFO] [logging.py:107:log_dist] [Rank 0] step=3020, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3020 loss: 0.0728 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:10:19,429] [INFO] [logging.py:107:log_dist] [Rank 0] step=3021, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3021 loss: 0.1071 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:10:30,099] [INFO] [logging.py:107:log_dist] [Rank 0] step=3022, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3022 loss: 0.0703 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:10:40,773] [INFO] [logging.py:107:log_dist] [Rank 0] step=3023, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3023 loss: 0.0391 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:10:51,444] [INFO] [logging.py:107:log_dist] [Rank 0] step=3024, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3024 loss: 0.1818 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:11:02,272] [INFO] [logging.py:107:log_dist] [Rank 0] step=3025, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3025 loss: 0.1175 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 04:11:12,959] [INFO] [logging.py:107:log_dist] [Rank 0] step=3026, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3026 loss: 0.0533 iter time (s): 10.656 samples/sec: 0.094 +[2025-05-06 04:11:23,634] [INFO] [logging.py:107:log_dist] [Rank 0] step=3027, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3027 loss: 0.0303 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:11:34,303] [INFO] [logging.py:107:log_dist] [Rank 0] step=3028, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3028 loss: 0.0325 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:11:44,980] [INFO] [logging.py:107:log_dist] [Rank 0] step=3029, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3029 loss: 0.0341 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:11:55,653] [INFO] [logging.py:107:log_dist] [Rank 0] step=3030, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3030 loss: 0.1401 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:12:06,324] [INFO] [logging.py:107:log_dist] [Rank 0] step=3031, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3031 loss: 0.0595 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:12:16,993] [INFO] [logging.py:107:log_dist] [Rank 0] step=3032, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3032 loss: 0.0347 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:12:27,663] [INFO] [logging.py:107:log_dist] [Rank 0] step=3033, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3033 loss: 0.0389 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:12:38,489] [INFO] [logging.py:107:log_dist] [Rank 0] step=3034, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3034 loss: 0.0674 iter time (s): 10.800 samples/sec: 0.093 +Started new epoch: 75 +[2025-05-06 04:12:49,495] [INFO] [logging.py:107:log_dist] [Rank 0] step=3035, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3035 loss: 0.0361 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:13:00,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=3036, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3036 loss: 0.0744 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:13:10,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=3037, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3037 loss: 0.0464 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:13:21,504] [INFO] [logging.py:107:log_dist] [Rank 0] step=3038, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3038 loss: 0.1529 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:13:32,174] [INFO] [logging.py:107:log_dist] [Rank 0] step=3039, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3039 loss: 0.0319 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:13:42,843] [INFO] [logging.py:107:log_dist] [Rank 0] step=3040, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3040 loss: 0.0348 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:13:53,515] [INFO] [logging.py:107:log_dist] [Rank 0] step=3041, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3041 loss: 0.1074 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:14:04,182] [INFO] [logging.py:107:log_dist] [Rank 0] step=3042, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3042 loss: 0.1031 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 04:14:15,015] [INFO] [logging.py:107:log_dist] [Rank 0] step=3043, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3043 loss: 0.0529 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-06 04:14:25,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=3044, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3044 loss: 0.0539 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:14:36,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=3045, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3045 loss: 0.0535 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:14:47,022] [INFO] [logging.py:107:log_dist] [Rank 0] step=3046, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3046 loss: 0.1372 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 04:14:57,695] [INFO] [logging.py:107:log_dist] [Rank 0] step=3047, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3047 loss: 0.1534 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:15:08,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=3048, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3048 loss: 0.0269 iter time (s): 10.656 samples/sec: 0.094 +[2025-05-06 04:15:19,100] [INFO] [logging.py:107:log_dist] [Rank 0] step=3049, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3049 loss: 0.2060 iter time (s): 10.688 samples/sec: 0.094 +[2025-05-06 04:15:29,846] [INFO] [logging.py:107:log_dist] [Rank 0] step=3050, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3050 loss: 0.1554 iter time (s): 10.709 samples/sec: 0.093 +[2025-05-06 04:15:40,763] [INFO] [logging.py:107:log_dist] [Rank 0] step=3051, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3051 loss: 0.0300 iter time (s): 10.885 samples/sec: 0.092 +[2025-05-06 04:15:51,462] [INFO] [logging.py:107:log_dist] [Rank 0] step=3052, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3052 loss: 0.0808 iter time (s): 10.667 samples/sec: 0.094 +[2025-05-06 04:16:02,125] [INFO] [logging.py:107:log_dist] [Rank 0] step=3053, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3053 loss: 0.0504 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:16:12,796] [INFO] [logging.py:107:log_dist] [Rank 0] step=3054, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3054 loss: 0.0447 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:16:23,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=3055, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3055 loss: 0.0634 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:16:34,122] [INFO] [logging.py:107:log_dist] [Rank 0] step=3056, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3056 loss: 0.0926 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 04:16:44,782] [INFO] [logging.py:107:log_dist] [Rank 0] step=3057, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3057 loss: 0.1767 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 04:16:55,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=3058, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3058 loss: 0.0331 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:17:06,107] [INFO] [logging.py:107:log_dist] [Rank 0] step=3059, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3059 loss: 0.1023 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:17:16,955] [INFO] [logging.py:107:log_dist] [Rank 0] step=3060, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3060 loss: 0.0358 iter time (s): 10.817 samples/sec: 0.092 +[2025-05-06 04:17:27,616] [INFO] [logging.py:107:log_dist] [Rank 0] step=3061, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3061 loss: 0.0660 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:17:38,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=3062, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3062 loss: 0.0967 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:17:48,941] [INFO] [logging.py:107:log_dist] [Rank 0] step=3063, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3063 loss: 0.3721 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:17:59,613] [INFO] [logging.py:107:log_dist] [Rank 0] step=3064, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3064 loss: 0.0705 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:18:10,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=3065, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3065 loss: 0.0443 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:18:20,935] [INFO] [logging.py:107:log_dist] [Rank 0] step=3066, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3066 loss: 0.2068 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 04:18:31,599] [INFO] [logging.py:107:log_dist] [Rank 0] step=3067, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3067 loss: 0.0301 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 04:18:42,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=3068, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3068 loss: 0.0471 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:18:53,088] [INFO] [logging.py:107:log_dist] [Rank 0] step=3069, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3069 loss: 0.0614 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-06 04:19:03,766] [INFO] [logging.py:107:log_dist] [Rank 0] step=3070, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3070 loss: 0.0408 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:19:14,429] [INFO] [logging.py:107:log_dist] [Rank 0] step=3071, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3071 loss: 0.0476 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:19:25,086] [INFO] [logging.py:107:log_dist] [Rank 0] step=3072, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3072 loss: 0.0870 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 04:19:35,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=3073, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3073 loss: 0.0490 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:19:46,410] [INFO] [logging.py:107:log_dist] [Rank 0] step=3074, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3074 loss: 0.0961 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:19:57,068] [INFO] [logging.py:107:log_dist] [Rank 0] step=3075, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3075 loss: 0.0443 iter time (s): 10.632 samples/sec: 0.094 +Started new epoch: 76 +[2025-05-06 04:20:08,072] [INFO] [logging.py:107:log_dist] [Rank 0] step=3076, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3076 loss: 0.0318 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:20:18,919] [INFO] [logging.py:107:log_dist] [Rank 0] step=3077, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3077 loss: 0.2213 iter time (s): 10.816 samples/sec: 0.092 +[2025-05-06 04:20:29,574] [INFO] [logging.py:107:log_dist] [Rank 0] step=3078, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3078 loss: 0.1257 iter time (s): 10.624 samples/sec: 0.094 +[2025-05-06 04:20:40,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=3079, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3079 loss: 0.0704 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:20:50,894] [INFO] [logging.py:107:log_dist] [Rank 0] step=3080, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3080 loss: 0.0451 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 04:21:01,553] [INFO] [logging.py:107:log_dist] [Rank 0] step=3081, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3081 loss: 0.0735 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 04:21:12,214] [INFO] [logging.py:107:log_dist] [Rank 0] step=3082, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3082 loss: 0.2526 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:21:22,871] [INFO] [logging.py:107:log_dist] [Rank 0] step=3083, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3083 loss: 0.0499 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 04:21:33,529] [INFO] [logging.py:107:log_dist] [Rank 0] step=3084, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3084 loss: 0.0318 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 04:21:44,191] [INFO] [logging.py:107:log_dist] [Rank 0] step=3085, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3085 loss: 0.0376 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:21:55,044] [INFO] [logging.py:107:log_dist] [Rank 0] step=3086, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3086 loss: 0.0904 iter time (s): 10.822 samples/sec: 0.092 +[2025-05-06 04:22:05,708] [INFO] [logging.py:107:log_dist] [Rank 0] step=3087, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3087 loss: 0.4430 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:22:16,375] [INFO] [logging.py:107:log_dist] [Rank 0] step=3088, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3088 loss: 0.0378 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:22:27,037] [INFO] [logging.py:107:log_dist] [Rank 0] step=3089, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3089 loss: 0.0646 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 04:22:37,710] [INFO] [logging.py:107:log_dist] [Rank 0] step=3090, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3090 loss: 0.0280 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 04:22:48,377] [INFO] [logging.py:107:log_dist] [Rank 0] step=3091, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3091 loss: 0.1713 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:22:59,034] [INFO] [logging.py:107:log_dist] [Rank 0] step=3092, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3092 loss: 0.0316 iter time (s): 10.625 samples/sec: 0.094 +[2025-05-06 04:23:09,695] [INFO] [logging.py:107:log_dist] [Rank 0] step=3093, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3093 loss: 0.0594 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:23:20,358] [INFO] [logging.py:107:log_dist] [Rank 0] step=3094, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3094 loss: 0.0762 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:23:31,174] [INFO] [logging.py:107:log_dist] [Rank 0] step=3095, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3095 loss: 0.0584 iter time (s): 10.784 samples/sec: 0.093 +[2025-05-06 04:23:41,837] [INFO] [logging.py:107:log_dist] [Rank 0] step=3096, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3096 loss: 0.0293 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:23:52,501] [INFO] [logging.py:107:log_dist] [Rank 0] step=3097, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3097 loss: 0.0497 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 04:24:03,160] [INFO] [logging.py:107:log_dist] [Rank 0] step=3098, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3098 loss: 0.0555 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 04:24:13,827] [INFO] [logging.py:107:log_dist] [Rank 0] step=3099, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3099 loss: 0.0573 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:24:24,491] [INFO] [logging.py:107:log_dist] [Rank 0] step=3100, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3100 loss: 0.1258 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:24:35,152] [INFO] [logging.py:107:log_dist] [Rank 0] step=3101, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3101 loss: 0.2586 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:24:45,812] [INFO] [logging.py:107:log_dist] [Rank 0] step=3102, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3102 loss: 0.0805 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:24:56,646] [INFO] [logging.py:107:log_dist] [Rank 0] step=3103, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3103 loss: 0.0393 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 04:25:07,308] [INFO] [logging.py:107:log_dist] [Rank 0] step=3104, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3104 loss: 0.0395 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:25:17,967] [INFO] [logging.py:107:log_dist] [Rank 0] step=3105, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3105 loss: 0.0935 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 04:25:28,631] [INFO] [logging.py:107:log_dist] [Rank 0] step=3106, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3106 loss: 0.1061 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:25:39,289] [INFO] [logging.py:107:log_dist] [Rank 0] step=3107, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3107 loss: 0.1733 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 04:25:49,949] [INFO] [logging.py:107:log_dist] [Rank 0] step=3108, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3108 loss: 0.1028 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 04:26:00,609] [INFO] [logging.py:107:log_dist] [Rank 0] step=3109, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3109 loss: 0.1007 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:26:11,272] [INFO] [logging.py:107:log_dist] [Rank 0] step=3110, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3110 loss: 0.0574 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:26:21,932] [INFO] [logging.py:107:log_dist] [Rank 0] step=3111, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3111 loss: 0.0785 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 04:26:32,752] [INFO] [logging.py:107:log_dist] [Rank 0] step=3112, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3112 loss: 0.0320 iter time (s): 10.791 samples/sec: 0.093 +[2025-05-06 04:26:43,411] [INFO] [logging.py:107:log_dist] [Rank 0] step=3113, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3113 loss: 0.0385 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 04:26:54,073] [INFO] [logging.py:107:log_dist] [Rank 0] step=3114, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3114 loss: 0.1881 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:27:04,737] [INFO] [logging.py:107:log_dist] [Rank 0] step=3115, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3115 loss: 0.1042 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 04:27:15,391] [INFO] [logging.py:107:log_dist] [Rank 0] step=3116, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3116 loss: 0.0481 iter time (s): 10.627 samples/sec: 0.094 +Started new epoch: 77 +[2025-05-06 04:27:26,391] [INFO] [logging.py:107:log_dist] [Rank 0] step=3117, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3117 loss: 0.0595 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:27:37,051] [INFO] [logging.py:107:log_dist] [Rank 0] step=3118, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3118 loss: 0.0284 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:27:47,714] [INFO] [logging.py:107:log_dist] [Rank 0] step=3119, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3119 loss: 0.0470 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:27:58,562] [INFO] [logging.py:107:log_dist] [Rank 0] step=3120, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3120 loss: 0.0481 iter time (s): 10.817 samples/sec: 0.092 +[2025-05-06 04:28:09,228] [INFO] [logging.py:107:log_dist] [Rank 0] step=3121, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3121 loss: 0.0470 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 04:28:19,886] [INFO] [logging.py:107:log_dist] [Rank 0] step=3122, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3122 loss: 0.0420 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 04:28:30,555] [INFO] [logging.py:107:log_dist] [Rank 0] step=3123, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3123 loss: 0.1046 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:28:41,213] [INFO] [logging.py:107:log_dist] [Rank 0] step=3124, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3124 loss: 0.1051 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 04:28:51,878] [INFO] [logging.py:107:log_dist] [Rank 0] step=3125, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3125 loss: 0.1210 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:29:02,537] [INFO] [logging.py:107:log_dist] [Rank 0] step=3126, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3126 loss: 0.3435 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 04:29:13,208] [INFO] [logging.py:107:log_dist] [Rank 0] step=3127, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3127 loss: 0.0839 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 04:29:23,866] [INFO] [logging.py:107:log_dist] [Rank 0] step=3128, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3128 loss: 0.1924 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 04:29:34,718] [INFO] [logging.py:107:log_dist] [Rank 0] step=3129, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3129 loss: 0.1869 iter time (s): 10.820 samples/sec: 0.092 +[2025-05-06 04:29:45,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=3130, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3130 loss: 0.0299 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:29:56,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=3131, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3131 loss: 0.0894 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 04:30:06,710] [INFO] [logging.py:107:log_dist] [Rank 0] step=3132, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3132 loss: 0.0637 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:30:17,367] [INFO] [logging.py:107:log_dist] [Rank 0] step=3133, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3133 loss: 0.0576 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 04:30:28,029] [INFO] [logging.py:107:log_dist] [Rank 0] step=3134, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3134 loss: 0.0494 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:30:38,694] [INFO] [logging.py:107:log_dist] [Rank 0] step=3135, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3135 loss: 0.0310 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:30:49,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=3136, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3136 loss: 0.3437 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:31:00,015] [INFO] [logging.py:107:log_dist] [Rank 0] step=3137, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3137 loss: 0.0322 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 04:31:10,837] [INFO] [logging.py:107:log_dist] [Rank 0] step=3138, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3138 loss: 0.0557 iter time (s): 10.790 samples/sec: 0.093 +[2025-05-06 04:31:21,497] [INFO] [logging.py:107:log_dist] [Rank 0] step=3139, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3139 loss: 0.0417 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 04:31:32,158] [INFO] [logging.py:107:log_dist] [Rank 0] step=3140, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3140 loss: 0.0786 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:31:42,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=3141, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3141 loss: 0.0457 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 04:31:53,487] [INFO] [logging.py:107:log_dist] [Rank 0] step=3142, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3142 loss: 0.1351 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 04:32:04,149] [INFO] [logging.py:107:log_dist] [Rank 0] step=3143, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3143 loss: 0.0355 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:32:14,821] [INFO] [logging.py:107:log_dist] [Rank 0] step=3144, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3144 loss: 0.0393 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 04:32:25,482] [INFO] [logging.py:107:log_dist] [Rank 0] step=3145, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3145 loss: 0.0502 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:32:36,329] [INFO] [logging.py:107:log_dist] [Rank 0] step=3146, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3146 loss: 0.0896 iter time (s): 10.817 samples/sec: 0.092 +[2025-05-06 04:32:46,992] [INFO] [logging.py:107:log_dist] [Rank 0] step=3147, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3147 loss: 0.0532 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 04:32:57,652] [INFO] [logging.py:107:log_dist] [Rank 0] step=3148, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3148 loss: 0.0547 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 04:33:08,323] [INFO] [logging.py:107:log_dist] [Rank 0] step=3149, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3149 loss: 0.0486 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:33:18,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=3150, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3150 loss: 0.3525 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:33:29,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=3151, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3151 loss: 0.0349 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 04:33:40,347] [INFO] [logging.py:107:log_dist] [Rank 0] step=3152, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3152 loss: 0.0912 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:33:51,018] [INFO] [logging.py:107:log_dist] [Rank 0] step=3153, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3153 loss: 0.2082 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:34:01,685] [INFO] [logging.py:107:log_dist] [Rank 0] step=3154, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3154 loss: 0.0995 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:34:12,548] [INFO] [logging.py:107:log_dist] [Rank 0] step=3155, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3155 loss: 0.1563 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 04:34:23,216] [INFO] [logging.py:107:log_dist] [Rank 0] step=3156, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3156 loss: 0.0314 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:34:33,882] [INFO] [logging.py:107:log_dist] [Rank 0] step=3157, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3157 loss: 0.2511 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 78 +[2025-05-06 04:34:44,890] [INFO] [logging.py:107:log_dist] [Rank 0] step=3158, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3158 loss: 0.1198 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:34:55,567] [INFO] [logging.py:107:log_dist] [Rank 0] step=3159, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3159 loss: 0.0766 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:35:06,241] [INFO] [logging.py:107:log_dist] [Rank 0] step=3160, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3160 loss: 0.0486 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:35:16,909] [INFO] [logging.py:107:log_dist] [Rank 0] step=3161, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3161 loss: 0.0348 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:35:27,584] [INFO] [logging.py:107:log_dist] [Rank 0] step=3162, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3162 loss: 0.0690 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:35:38,260] [INFO] [logging.py:107:log_dist] [Rank 0] step=3163, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3163 loss: 0.2020 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:35:49,086] [INFO] [logging.py:107:log_dist] [Rank 0] step=3164, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3164 loss: 0.0604 iter time (s): 10.796 samples/sec: 0.093 +[2025-05-06 04:35:59,759] [INFO] [logging.py:107:log_dist] [Rank 0] step=3165, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3165 loss: 0.0657 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:36:10,430] [INFO] [logging.py:107:log_dist] [Rank 0] step=3166, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3166 loss: 0.0677 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:36:21,111] [INFO] [logging.py:107:log_dist] [Rank 0] step=3167, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3167 loss: 0.1596 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 04:36:31,792] [INFO] [logging.py:107:log_dist] [Rank 0] step=3168, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3168 loss: 0.1003 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:36:42,465] [INFO] [logging.py:107:log_dist] [Rank 0] step=3169, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3169 loss: 0.0893 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:36:53,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=3170, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3170 loss: 0.0529 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:37:03,812] [INFO] [logging.py:107:log_dist] [Rank 0] step=3171, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3171 loss: 0.0347 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:37:14,646] [INFO] [logging.py:107:log_dist] [Rank 0] step=3172, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3172 loss: 0.0281 iter time (s): 10.803 samples/sec: 0.093 +[2025-05-06 04:37:25,316] [INFO] [logging.py:107:log_dist] [Rank 0] step=3173, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3173 loss: 0.0459 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:37:35,997] [INFO] [logging.py:107:log_dist] [Rank 0] step=3174, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3174 loss: 0.1608 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:37:46,671] [INFO] [logging.py:107:log_dist] [Rank 0] step=3175, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3175 loss: 0.1481 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:37:57,356] [INFO] [logging.py:107:log_dist] [Rank 0] step=3176, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3176 loss: 0.2497 iter time (s): 10.654 samples/sec: 0.094 +[2025-05-06 04:38:08,037] [INFO] [logging.py:107:log_dist] [Rank 0] step=3177, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3177 loss: 0.0877 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 04:38:18,707] [INFO] [logging.py:107:log_dist] [Rank 0] step=3178, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3178 loss: 0.0921 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:38:29,376] [INFO] [logging.py:107:log_dist] [Rank 0] step=3179, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3179 loss: 0.0286 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:38:40,060] [INFO] [logging.py:107:log_dist] [Rank 0] step=3180, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3180 loss: 0.0855 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:38:50,903] [INFO] [logging.py:107:log_dist] [Rank 0] step=3181, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3181 loss: 0.4557 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 04:39:01,577] [INFO] [logging.py:107:log_dist] [Rank 0] step=3182, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3182 loss: 0.0387 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:39:12,251] [INFO] [logging.py:107:log_dist] [Rank 0] step=3183, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3183 loss: 0.0904 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:39:22,922] [INFO] [logging.py:107:log_dist] [Rank 0] step=3184, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3184 loss: 0.2877 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:39:33,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=3185, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3185 loss: 0.0341 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:39:44,267] [INFO] [logging.py:107:log_dist] [Rank 0] step=3186, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3186 loss: 0.0525 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:39:54,939] [INFO] [logging.py:107:log_dist] [Rank 0] step=3187, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3187 loss: 0.0382 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:40:05,619] [INFO] [logging.py:107:log_dist] [Rank 0] step=3188, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3188 loss: 0.0292 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 04:40:16,478] [INFO] [logging.py:107:log_dist] [Rank 0] step=3189, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3189 loss: 0.0340 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 04:40:27,149] [INFO] [logging.py:107:log_dist] [Rank 0] step=3190, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3190 loss: 0.0348 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:40:37,829] [INFO] [logging.py:107:log_dist] [Rank 0] step=3191, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3191 loss: 0.0386 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 04:40:48,505] [INFO] [logging.py:107:log_dist] [Rank 0] step=3192, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3192 loss: 0.0342 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:40:59,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=3193, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3193 loss: 0.1028 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:41:09,849] [INFO] [logging.py:107:log_dist] [Rank 0] step=3194, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3194 loss: 0.0451 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:41:20,519] [INFO] [logging.py:107:log_dist] [Rank 0] step=3195, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3195 loss: 0.0585 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:41:31,187] [INFO] [logging.py:107:log_dist] [Rank 0] step=3196, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3196 loss: 0.0574 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:41:41,861] [INFO] [logging.py:107:log_dist] [Rank 0] step=3197, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3197 loss: 0.0598 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:41:52,723] [INFO] [logging.py:107:log_dist] [Rank 0] step=3198, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3198 loss: 0.1558 iter time (s): 10.835 samples/sec: 0.092 +Started new epoch: 79 +[2025-05-06 04:42:03,727] [INFO] [logging.py:107:log_dist] [Rank 0] step=3199, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3199 loss: 0.1123 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:42:14,406] [INFO] [logging.py:107:log_dist] [Rank 0] step=3200, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3200 loss: 0.0633 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:42:25,076] [INFO] [logging.py:107:log_dist] [Rank 0] step=3201, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3201 loss: 0.0674 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:42:35,745] [INFO] [logging.py:107:log_dist] [Rank 0] step=3202, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3202 loss: 0.3125 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:42:46,419] [INFO] [logging.py:107:log_dist] [Rank 0] step=3203, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3203 loss: 0.0323 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:42:57,088] [INFO] [logging.py:107:log_dist] [Rank 0] step=3204, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3204 loss: 0.0707 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:43:07,759] [INFO] [logging.py:107:log_dist] [Rank 0] step=3205, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3205 loss: 0.0538 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:43:18,434] [INFO] [logging.py:107:log_dist] [Rank 0] step=3206, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3206 loss: 0.1067 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:43:29,265] [INFO] [logging.py:107:log_dist] [Rank 0] step=3207, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3207 loss: 0.0815 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 04:43:39,933] [INFO] [logging.py:107:log_dist] [Rank 0] step=3208, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3208 loss: 0.0792 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 04:43:50,607] [INFO] [logging.py:107:log_dist] [Rank 0] step=3209, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3209 loss: 0.2075 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:44:01,279] [INFO] [logging.py:107:log_dist] [Rank 0] step=3210, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3210 loss: 0.0260 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:44:11,949] [INFO] [logging.py:107:log_dist] [Rank 0] step=3211, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3211 loss: 0.1272 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:44:22,622] [INFO] [logging.py:107:log_dist] [Rank 0] step=3212, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3212 loss: 0.1052 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:44:33,292] [INFO] [logging.py:107:log_dist] [Rank 0] step=3213, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3213 loss: 0.0255 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:44:43,965] [INFO] [logging.py:107:log_dist] [Rank 0] step=3214, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3214 loss: 0.1015 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:44:54,824] [INFO] [logging.py:107:log_dist] [Rank 0] step=3215, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3215 loss: 0.0682 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 04:45:05,500] [INFO] [logging.py:107:log_dist] [Rank 0] step=3216, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3216 loss: 0.0379 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:45:16,170] [INFO] [logging.py:107:log_dist] [Rank 0] step=3217, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3217 loss: 0.1214 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:45:26,844] [INFO] [logging.py:107:log_dist] [Rank 0] step=3218, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3218 loss: 0.0303 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:45:37,514] [INFO] [logging.py:107:log_dist] [Rank 0] step=3219, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3219 loss: 0.0253 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:45:48,184] [INFO] [logging.py:107:log_dist] [Rank 0] step=3220, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3220 loss: 0.1729 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:45:58,856] [INFO] [logging.py:107:log_dist] [Rank 0] step=3221, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3221 loss: 0.2186 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:46:09,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=3222, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3222 loss: 0.1381 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:46:20,196] [INFO] [logging.py:107:log_dist] [Rank 0] step=3223, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3223 loss: 0.0323 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:46:31,063] [INFO] [logging.py:107:log_dist] [Rank 0] step=3224, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3224 loss: 0.0362 iter time (s): 10.836 samples/sec: 0.092 +[2025-05-06 04:46:41,738] [INFO] [logging.py:107:log_dist] [Rank 0] step=3225, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3225 loss: 0.0397 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:46:52,405] [INFO] [logging.py:107:log_dist] [Rank 0] step=3226, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3226 loss: 0.0584 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:47:03,078] [INFO] [logging.py:107:log_dist] [Rank 0] step=3227, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3227 loss: 0.0880 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:47:13,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=3228, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3228 loss: 0.0421 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:47:24,431] [INFO] [logging.py:107:log_dist] [Rank 0] step=3229, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3229 loss: 0.0344 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:47:35,106] [INFO] [logging.py:107:log_dist] [Rank 0] step=3230, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3230 loss: 0.0409 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:47:45,777] [INFO] [logging.py:107:log_dist] [Rank 0] step=3231, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3231 loss: 0.1933 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:47:56,454] [INFO] [logging.py:107:log_dist] [Rank 0] step=3232, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3232 loss: 0.1133 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 04:48:07,287] [INFO] [logging.py:107:log_dist] [Rank 0] step=3233, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3233 loss: 0.1058 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-06 04:48:17,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=3234, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3234 loss: 0.0283 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:48:28,630] [INFO] [logging.py:107:log_dist] [Rank 0] step=3235, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3235 loss: 0.0936 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:48:39,302] [INFO] [logging.py:107:log_dist] [Rank 0] step=3236, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3236 loss: 0.0264 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:48:49,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=3237, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3237 loss: 0.3330 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:49:00,647] [INFO] [logging.py:107:log_dist] [Rank 0] step=3238, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3238 loss: 0.0838 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 04:49:11,319] [INFO] [logging.py:107:log_dist] [Rank 0] step=3239, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3239 loss: 0.1833 iter time (s): 10.645 samples/sec: 0.094 +Started new epoch: 80 +[2025-05-06 04:49:22,341] [INFO] [logging.py:107:log_dist] [Rank 0] step=3240, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3240 loss: 0.1795 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:49:33,179] [INFO] [logging.py:107:log_dist] [Rank 0] step=3241, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3241 loss: 0.0541 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-06 04:49:43,854] [INFO] [logging.py:107:log_dist] [Rank 0] step=3242, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3242 loss: 0.0447 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:49:54,525] [INFO] [logging.py:107:log_dist] [Rank 0] step=3243, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3243 loss: 0.1310 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:50:05,203] [INFO] [logging.py:107:log_dist] [Rank 0] step=3244, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3244 loss: 0.0405 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 04:50:15,873] [INFO] [logging.py:107:log_dist] [Rank 0] step=3245, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3245 loss: 0.0336 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:50:26,541] [INFO] [logging.py:107:log_dist] [Rank 0] step=3246, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3246 loss: 0.0955 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:50:37,219] [INFO] [logging.py:107:log_dist] [Rank 0] step=3247, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3247 loss: 0.0545 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:50:47,888] [INFO] [logging.py:107:log_dist] [Rank 0] step=3248, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3248 loss: 0.1116 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:50:58,562] [INFO] [logging.py:107:log_dist] [Rank 0] step=3249, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3249 loss: 0.0599 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:51:09,399] [INFO] [logging.py:107:log_dist] [Rank 0] step=3250, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3250 loss: 0.0465 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 04:51:20,070] [INFO] [logging.py:107:log_dist] [Rank 0] step=3251, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3251 loss: 0.0334 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:51:30,744] [INFO] [logging.py:107:log_dist] [Rank 0] step=3252, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3252 loss: 0.0393 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:51:41,414] [INFO] [logging.py:107:log_dist] [Rank 0] step=3253, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3253 loss: 0.0976 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:51:52,084] [INFO] [logging.py:107:log_dist] [Rank 0] step=3254, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3254 loss: 0.0269 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:52:02,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=3255, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3255 loss: 0.0595 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:52:13,428] [INFO] [logging.py:107:log_dist] [Rank 0] step=3256, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3256 loss: 0.2892 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:52:24,098] [INFO] [logging.py:107:log_dist] [Rank 0] step=3257, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3257 loss: 0.0280 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:52:34,953] [INFO] [logging.py:107:log_dist] [Rank 0] step=3258, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3258 loss: 0.1353 iter time (s): 10.823 samples/sec: 0.092 +[2025-05-06 04:52:45,629] [INFO] [logging.py:107:log_dist] [Rank 0] step=3259, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3259 loss: 0.0829 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:52:56,297] [INFO] [logging.py:107:log_dist] [Rank 0] step=3260, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3260 loss: 0.0718 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 04:53:06,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=3261, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3261 loss: 0.0857 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:53:17,651] [INFO] [logging.py:107:log_dist] [Rank 0] step=3262, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3262 loss: 0.0403 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 04:53:28,323] [INFO] [logging.py:107:log_dist] [Rank 0] step=3263, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3263 loss: 0.0661 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:53:38,996] [INFO] [logging.py:107:log_dist] [Rank 0] step=3264, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3264 loss: 0.1352 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:53:49,673] [INFO] [logging.py:107:log_dist] [Rank 0] step=3265, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3265 loss: 0.0366 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:54:00,346] [INFO] [logging.py:107:log_dist] [Rank 0] step=3266, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3266 loss: 0.0250 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:54:11,206] [INFO] [logging.py:107:log_dist] [Rank 0] step=3267, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3267 loss: 0.0414 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 04:54:21,878] [INFO] [logging.py:107:log_dist] [Rank 0] step=3268, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3268 loss: 0.1093 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:54:32,556] [INFO] [logging.py:107:log_dist] [Rank 0] step=3269, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3269 loss: 0.3000 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:54:43,228] [INFO] [logging.py:107:log_dist] [Rank 0] step=3270, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3270 loss: 0.0370 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:54:53,909] [INFO] [logging.py:107:log_dist] [Rank 0] step=3271, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3271 loss: 0.0525 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:55:04,584] [INFO] [logging.py:107:log_dist] [Rank 0] step=3272, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3272 loss: 0.0492 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 04:55:15,257] [INFO] [logging.py:107:log_dist] [Rank 0] step=3273, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3273 loss: 0.0572 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:55:25,934] [INFO] [logging.py:107:log_dist] [Rank 0] step=3274, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3274 loss: 0.0386 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 04:55:36,609] [INFO] [logging.py:107:log_dist] [Rank 0] step=3275, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3275 loss: 0.1142 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:55:47,437] [INFO] [logging.py:107:log_dist] [Rank 0] step=3276, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3276 loss: 0.0739 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 04:55:58,109] [INFO] [logging.py:107:log_dist] [Rank 0] step=3277, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3277 loss: 0.0477 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:56:08,779] [INFO] [logging.py:107:log_dist] [Rank 0] step=3278, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3278 loss: 0.1088 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:56:19,446] [INFO] [logging.py:107:log_dist] [Rank 0] step=3279, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3279 loss: 0.0511 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:56:30,111] [INFO] [logging.py:107:log_dist] [Rank 0] step=3280, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3280 loss: 0.0431 iter time (s): 10.638 samples/sec: 0.094 +Saving model to directory epoch80 +Started new epoch: 81 +[2025-05-06 04:56:42,638] [INFO] [logging.py:107:log_dist] [Rank 0] step=3281, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3281 loss: 0.0990 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:56:53,315] [INFO] [logging.py:107:log_dist] [Rank 0] step=3282, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3282 loss: 0.0401 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:57:03,986] [INFO] [logging.py:107:log_dist] [Rank 0] step=3283, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3283 loss: 0.0497 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:57:14,816] [INFO] [logging.py:107:log_dist] [Rank 0] step=3284, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3284 loss: 0.0309 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 04:57:25,492] [INFO] [logging.py:107:log_dist] [Rank 0] step=3285, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3285 loss: 0.0559 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:57:36,166] [INFO] [logging.py:107:log_dist] [Rank 0] step=3286, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3286 loss: 0.0399 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:57:46,837] [INFO] [logging.py:107:log_dist] [Rank 0] step=3287, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3287 loss: 0.0663 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:57:57,515] [INFO] [logging.py:107:log_dist] [Rank 0] step=3288, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3288 loss: 0.0309 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:58:08,186] [INFO] [logging.py:107:log_dist] [Rank 0] step=3289, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3289 loss: 0.0745 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 04:58:18,855] [INFO] [logging.py:107:log_dist] [Rank 0] step=3290, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3290 loss: 0.0277 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 04:58:29,532] [INFO] [logging.py:107:log_dist] [Rank 0] step=3291, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3291 loss: 0.1704 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:58:40,398] [INFO] [logging.py:107:log_dist] [Rank 0] step=3292, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3292 loss: 0.1048 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 04:58:51,072] [INFO] [logging.py:107:log_dist] [Rank 0] step=3293, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3293 loss: 0.0870 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 04:59:01,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=3294, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3294 loss: 0.0755 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 04:59:12,419] [INFO] [logging.py:107:log_dist] [Rank 0] step=3295, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3295 loss: 0.0388 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 04:59:23,089] [INFO] [logging.py:107:log_dist] [Rank 0] step=3296, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3296 loss: 0.0864 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 04:59:33,767] [INFO] [logging.py:107:log_dist] [Rank 0] step=3297, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3297 loss: 0.0942 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 04:59:44,437] [INFO] [logging.py:107:log_dist] [Rank 0] step=3298, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3298 loss: 0.0458 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 04:59:55,108] [INFO] [logging.py:107:log_dist] [Rank 0] step=3299, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3299 loss: 0.0382 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:00:05,783] [INFO] [logging.py:107:log_dist] [Rank 0] step=3300, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3300 loss: 0.0564 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:00:16,615] [INFO] [logging.py:107:log_dist] [Rank 0] step=3301, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3301 loss: 0.0768 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 05:00:27,286] [INFO] [logging.py:107:log_dist] [Rank 0] step=3302, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3302 loss: 0.0351 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:00:37,957] [INFO] [logging.py:107:log_dist] [Rank 0] step=3303, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3303 loss: 0.1643 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:00:48,626] [INFO] [logging.py:107:log_dist] [Rank 0] step=3304, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3304 loss: 0.0276 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:00:59,294] [INFO] [logging.py:107:log_dist] [Rank 0] step=3305, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3305 loss: 0.0349 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:01:09,965] [INFO] [logging.py:107:log_dist] [Rank 0] step=3306, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3306 loss: 0.0423 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:01:20,642] [INFO] [logging.py:107:log_dist] [Rank 0] step=3307, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3307 loss: 0.0259 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:01:31,313] [INFO] [logging.py:107:log_dist] [Rank 0] step=3308, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3308 loss: 0.0537 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:01:41,996] [INFO] [logging.py:107:log_dist] [Rank 0] step=3309, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3309 loss: 0.0588 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 05:01:52,854] [INFO] [logging.py:107:log_dist] [Rank 0] step=3310, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3310 loss: 0.1008 iter time (s): 10.827 samples/sec: 0.092 +[2025-05-06 05:02:03,527] [INFO] [logging.py:107:log_dist] [Rank 0] step=3311, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3311 loss: 0.0280 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:02:14,198] [INFO] [logging.py:107:log_dist] [Rank 0] step=3312, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3312 loss: 0.0949 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:02:24,880] [INFO] [logging.py:107:log_dist] [Rank 0] step=3313, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3313 loss: 0.0381 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 05:02:35,551] [INFO] [logging.py:107:log_dist] [Rank 0] step=3314, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3314 loss: 0.0401 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:02:46,225] [INFO] [logging.py:107:log_dist] [Rank 0] step=3315, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3315 loss: 0.0292 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:02:56,894] [INFO] [logging.py:107:log_dist] [Rank 0] step=3316, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3316 loss: 0.0453 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:03:07,561] [INFO] [logging.py:107:log_dist] [Rank 0] step=3317, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3317 loss: 0.0457 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 05:03:18,396] [INFO] [logging.py:107:log_dist] [Rank 0] step=3318, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3318 loss: 0.0310 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 05:03:29,068] [INFO] [logging.py:107:log_dist] [Rank 0] step=3319, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3319 loss: 0.0484 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:03:39,739] [INFO] [logging.py:107:log_dist] [Rank 0] step=3320, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3320 loss: 0.0516 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:03:50,403] [INFO] [logging.py:107:log_dist] [Rank 0] step=3321, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3321 loss: 0.0466 iter time (s): 10.638 samples/sec: 0.094 +Started new epoch: 82 +[2025-05-06 05:04:01,404] [INFO] [logging.py:107:log_dist] [Rank 0] step=3322, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3322 loss: 0.0482 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:04:12,073] [INFO] [logging.py:107:log_dist] [Rank 0] step=3323, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3323 loss: 0.0357 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:04:22,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=3324, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3324 loss: 0.0678 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:04:33,418] [INFO] [logging.py:107:log_dist] [Rank 0] step=3325, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3325 loss: 0.1219 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:04:44,090] [INFO] [logging.py:107:log_dist] [Rank 0] step=3326, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3326 loss: 0.0486 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:04:54,929] [INFO] [logging.py:107:log_dist] [Rank 0] step=3327, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3327 loss: 0.0850 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-06 05:05:05,603] [INFO] [logging.py:107:log_dist] [Rank 0] step=3328, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3328 loss: 0.0877 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:05:16,275] [INFO] [logging.py:107:log_dist] [Rank 0] step=3329, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3329 loss: 0.0461 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:05:26,949] [INFO] [logging.py:107:log_dist] [Rank 0] step=3330, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3330 loss: 0.0424 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:05:37,626] [INFO] [logging.py:107:log_dist] [Rank 0] step=3331, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3331 loss: 0.0504 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:05:48,294] [INFO] [logging.py:107:log_dist] [Rank 0] step=3332, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3332 loss: 0.0354 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:05:58,971] [INFO] [logging.py:107:log_dist] [Rank 0] step=3333, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3333 loss: 0.0714 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:06:09,642] [INFO] [logging.py:107:log_dist] [Rank 0] step=3334, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3334 loss: 0.3115 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:06:20,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=3335, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3335 loss: 0.0981 iter time (s): 10.847 samples/sec: 0.092 +[2025-05-06 05:06:31,191] [INFO] [logging.py:107:log_dist] [Rank 0] step=3336, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3336 loss: 0.0429 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:06:41,862] [INFO] [logging.py:107:log_dist] [Rank 0] step=3337, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3337 loss: 0.0278 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:06:52,541] [INFO] [logging.py:107:log_dist] [Rank 0] step=3338, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3338 loss: 0.4914 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 05:07:03,216] [INFO] [logging.py:107:log_dist] [Rank 0] step=3339, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3339 loss: 0.0482 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:07:13,886] [INFO] [logging.py:107:log_dist] [Rank 0] step=3340, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3340 loss: 0.2013 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:07:24,560] [INFO] [logging.py:107:log_dist] [Rank 0] step=3341, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3341 loss: 0.0588 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:07:35,226] [INFO] [logging.py:107:log_dist] [Rank 0] step=3342, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3342 loss: 0.0427 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 05:07:45,897] [INFO] [logging.py:107:log_dist] [Rank 0] step=3343, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3343 loss: 0.0434 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:07:56,737] [INFO] [logging.py:107:log_dist] [Rank 0] step=3344, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3344 loss: 0.0310 iter time (s): 10.810 samples/sec: 0.093 +[2025-05-06 05:08:07,408] [INFO] [logging.py:107:log_dist] [Rank 0] step=3345, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3345 loss: 0.0853 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:08:18,081] [INFO] [logging.py:107:log_dist] [Rank 0] step=3346, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3346 loss: 0.1278 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:08:28,755] [INFO] [logging.py:107:log_dist] [Rank 0] step=3347, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3347 loss: 0.0465 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:08:39,426] [INFO] [logging.py:107:log_dist] [Rank 0] step=3348, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3348 loss: 0.0268 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:08:50,100] [INFO] [logging.py:107:log_dist] [Rank 0] step=3349, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3349 loss: 0.0691 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:09:00,771] [INFO] [logging.py:107:log_dist] [Rank 0] step=3350, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3350 loss: 0.2559 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:09:11,442] [INFO] [logging.py:107:log_dist] [Rank 0] step=3351, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3351 loss: 0.0343 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:09:22,113] [INFO] [logging.py:107:log_dist] [Rank 0] step=3352, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3352 loss: 0.2773 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:09:32,956] [INFO] [logging.py:107:log_dist] [Rank 0] step=3353, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3353 loss: 0.2175 iter time (s): 10.812 samples/sec: 0.092 +[2025-05-06 05:09:43,626] [INFO] [logging.py:107:log_dist] [Rank 0] step=3354, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3354 loss: 0.0294 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:09:54,298] [INFO] [logging.py:107:log_dist] [Rank 0] step=3355, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3355 loss: 0.0325 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:10:04,976] [INFO] [logging.py:107:log_dist] [Rank 0] step=3356, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3356 loss: 0.0696 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 05:10:15,655] [INFO] [logging.py:107:log_dist] [Rank 0] step=3357, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3357 loss: 0.0493 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:10:26,323] [INFO] [logging.py:107:log_dist] [Rank 0] step=3358, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3358 loss: 0.1168 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:10:36,993] [INFO] [logging.py:107:log_dist] [Rank 0] step=3359, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3359 loss: 0.0357 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:10:47,664] [INFO] [logging.py:107:log_dist] [Rank 0] step=3360, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3360 loss: 0.0493 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:10:58,335] [INFO] [logging.py:107:log_dist] [Rank 0] step=3361, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3361 loss: 0.3648 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:11:09,166] [INFO] [logging.py:107:log_dist] [Rank 0] step=3362, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3362 loss: 0.0641 iter time (s): 10.804 samples/sec: 0.093 +Started new epoch: 83 +[2025-05-06 05:11:20,163] [INFO] [logging.py:107:log_dist] [Rank 0] step=3363, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3363 loss: 0.1595 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:11:30,833] [INFO] [logging.py:107:log_dist] [Rank 0] step=3364, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3364 loss: 0.1212 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:11:41,509] [INFO] [logging.py:107:log_dist] [Rank 0] step=3365, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3365 loss: 0.0451 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:11:52,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=3366, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3366 loss: 0.0466 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:11:52,180] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step3366 is about to be saved! +[2025-05-06 05:11:52,181] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_00-model_states.pt... +[2025-05-06 05:11:52,182] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_00-model_states.pt. +[2025-05-06 05:11:52,188] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_01-model_states.pt... +[2025-05-06 05:11:52,195] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_01-model_states.pt. +[2025-05-06 05:11:52,201] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_02-model_states.pt... +[2025-05-06 05:11:52,208] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_02-model_states.pt. +[2025-05-06 05:11:52,211] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_03-model_states.pt... +[2025-05-06 05:11:52,218] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_03-model_states.pt. +[2025-05-06 05:11:52,222] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_04-model_states.pt... +[2025-05-06 05:11:52,228] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_04-model_states.pt. +[2025-05-06 05:11:52,232] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_05-model_states.pt... +[2025-05-06 05:11:52,238] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_05-model_states.pt. +[2025-05-06 05:11:52,242] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_06-model_states.pt... +[2025-05-06 05:11:52,248] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_06-model_states.pt. +[2025-05-06 05:11:52,252] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_07-model_states.pt... +[2025-05-06 05:11:52,258] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_07-model_states.pt. +[2025-05-06 05:11:52,262] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_08-model_states.pt... +[2025-05-06 05:11:52,268] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_08-model_states.pt. +[2025-05-06 05:11:52,272] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_09-model_states.pt... +[2025-05-06 05:11:52,279] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_09-model_states.pt. +[2025-05-06 05:11:52,282] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_10-model_states.pt... +[2025-05-06 05:11:52,289] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_10-model_states.pt. +[2025-05-06 05:11:52,293] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_11-model_states.pt... +[2025-05-06 05:11:52,299] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_11-model_states.pt. +[2025-05-06 05:11:52,302] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_12-model_states.pt... +[2025-05-06 05:11:52,309] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_12-model_states.pt. +[2025-05-06 05:11:52,312] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_13-model_states.pt... +[2025-05-06 05:11:52,318] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_13-model_states.pt. +[2025-05-06 05:11:52,322] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_14-model_states.pt... +[2025-05-06 05:11:52,328] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_14-model_states.pt. +[2025-05-06 05:11:52,332] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_15-model_states.pt... +[2025-05-06 05:11:52,338] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_15-model_states.pt. +[2025-05-06 05:11:52,341] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_16-model_states.pt... +[2025-05-06 05:11:52,348] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_16-model_states.pt. +[2025-05-06 05:11:52,351] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_17-model_states.pt... +[2025-05-06 05:11:52,358] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_17-model_states.pt. +[2025-05-06 05:11:52,361] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_18-model_states.pt... +[2025-05-06 05:11:52,368] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_18-model_states.pt. +[2025-05-06 05:11:52,371] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_19-model_states.pt... +[2025-05-06 05:11:52,377] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_19-model_states.pt. +[2025-05-06 05:11:52,381] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_20-model_states.pt... +[2025-05-06 05:11:52,387] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_20-model_states.pt. +[2025-05-06 05:11:52,391] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_21-model_states.pt... +[2025-05-06 05:11:52,397] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_21-model_states.pt. +[2025-05-06 05:11:52,400] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_22-model_states.pt... +[2025-05-06 05:11:52,407] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_22-model_states.pt. +[2025-05-06 05:11:52,410] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_23-model_states.pt... +[2025-05-06 05:11:52,417] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_23-model_states.pt. +[2025-05-06 05:11:52,420] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_24-model_states.pt... +[2025-05-06 05:11:52,427] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_24-model_states.pt. +[2025-05-06 05:11:52,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_25-model_states.pt... +[2025-05-06 05:11:52,436] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_25-model_states.pt. +[2025-05-06 05:11:52,440] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_26-model_states.pt... +[2025-05-06 05:11:52,446] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_26-model_states.pt. +[2025-05-06 05:11:52,449] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_27-model_states.pt... +[2025-05-06 05:11:52,456] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_27-model_states.pt. +[2025-05-06 05:11:52,459] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_28-model_states.pt... +[2025-05-06 05:11:52,466] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_28-model_states.pt. +[2025-05-06 05:11:52,469] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_29-model_states.pt... +[2025-05-06 05:11:52,476] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_29-model_states.pt. +[2025-05-06 05:11:52,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_30-model_states.pt... +[2025-05-06 05:11:52,485] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_30-model_states.pt. +[2025-05-06 05:11:52,489] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_31-model_states.pt... +[2025-05-06 05:11:52,495] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_31-model_states.pt. +[2025-05-06 05:11:52,498] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_32-model_states.pt... +[2025-05-06 05:11:52,505] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_32-model_states.pt. +[2025-05-06 05:11:52,508] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_33-model_states.pt... +[2025-05-06 05:11:52,515] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_33-model_states.pt. +[2025-05-06 05:11:52,518] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_34-model_states.pt... +[2025-05-06 05:11:52,525] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_34-model_states.pt. +[2025-05-06 05:11:52,528] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_35-model_states.pt... +[2025-05-06 05:11:52,534] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_35-model_states.pt. +[2025-05-06 05:11:52,538] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_36-model_states.pt... +[2025-05-06 05:11:52,544] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_36-model_states.pt. +[2025-05-06 05:11:52,548] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_37-model_states.pt... +[2025-05-06 05:11:52,555] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_37-model_states.pt. +[2025-05-06 05:11:52,558] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_38-model_states.pt... +[2025-05-06 05:11:52,564] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_38-model_states.pt. +[2025-05-06 05:11:52,568] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_39-model_states.pt... +[2025-05-06 05:11:52,574] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_39-model_states.pt. +[2025-05-06 05:11:52,578] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_40-model_states.pt... +[2025-05-06 05:11:52,584] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_40-model_states.pt. +[2025-05-06 05:11:52,585] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_41-model_states.pt... +[2025-05-06 05:11:52,585] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/layer_41-model_states.pt. +[2025-05-06 05:11:52,603] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/mp_rank_00_model_states.pt +[2025-05-06 05:11:52,603] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/mp_rank_00_model_states.pt... +[2025-05-06 05:11:53,607] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /workspace/ComfyUI/models/loras/out/20250505_19-10-35/global_step3366/mp_rank_00_model_states.pt. +[2025-05-06 05:11:53,607] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step3366 is ready now! +[2025-05-06 05:12:04,280] [INFO] [logging.py:107:log_dist] [Rank 0] step=3367, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3367 loss: 0.0938 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:12:14,951] [INFO] [logging.py:107:log_dist] [Rank 0] step=3368, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3368 loss: 0.1178 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:12:25,621] [INFO] [logging.py:107:log_dist] [Rank 0] step=3369, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3369 loss: 0.1062 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:12:36,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=3370, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3370 loss: 0.0258 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 05:12:47,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=3371, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3371 loss: 0.0984 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:12:57,801] [INFO] [logging.py:107:log_dist] [Rank 0] step=3372, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3372 loss: 0.0667 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:13:08,474] [INFO] [logging.py:107:log_dist] [Rank 0] step=3373, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3373 loss: 0.0418 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:13:19,152] [INFO] [logging.py:107:log_dist] [Rank 0] step=3374, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3374 loss: 0.0741 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 05:13:29,823] [INFO] [logging.py:107:log_dist] [Rank 0] step=3375, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3375 loss: 0.0361 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:13:40,494] [INFO] [logging.py:107:log_dist] [Rank 0] step=3376, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3376 loss: 0.1438 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:13:51,169] [INFO] [logging.py:107:log_dist] [Rank 0] step=3377, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3377 loss: 0.0579 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:14:02,035] [INFO] [logging.py:107:log_dist] [Rank 0] step=3378, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3378 loss: 0.0477 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-06 05:14:12,708] [INFO] [logging.py:107:log_dist] [Rank 0] step=3379, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3379 loss: 0.0320 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:14:23,385] [INFO] [logging.py:107:log_dist] [Rank 0] step=3380, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3380 loss: 0.0355 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:14:34,053] [INFO] [logging.py:107:log_dist] [Rank 0] step=3381, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3381 loss: 0.1451 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:14:44,724] [INFO] [logging.py:107:log_dist] [Rank 0] step=3382, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3382 loss: 0.0608 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:14:55,404] [INFO] [logging.py:107:log_dist] [Rank 0] step=3383, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3383 loss: 0.0840 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:15:06,078] [INFO] [logging.py:107:log_dist] [Rank 0] step=3384, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3384 loss: 0.0282 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:15:16,763] [INFO] [logging.py:107:log_dist] [Rank 0] step=3385, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3385 loss: 0.0475 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 05:15:27,436] [INFO] [logging.py:107:log_dist] [Rank 0] step=3386, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3386 loss: 0.0441 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:15:38,296] [INFO] [logging.py:107:log_dist] [Rank 0] step=3387, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3387 loss: 0.0287 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 05:15:48,968] [INFO] [logging.py:107:log_dist] [Rank 0] step=3388, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3388 loss: 0.0500 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:15:59,639] [INFO] [logging.py:107:log_dist] [Rank 0] step=3389, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3389 loss: 0.0783 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:16:10,310] [INFO] [logging.py:107:log_dist] [Rank 0] step=3390, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3390 loss: 0.0531 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:16:20,986] [INFO] [logging.py:107:log_dist] [Rank 0] step=3391, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3391 loss: 0.0353 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:16:31,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=3392, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3392 loss: 0.1330 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:16:42,330] [INFO] [logging.py:107:log_dist] [Rank 0] step=3393, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3393 loss: 0.0348 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:16:53,015] [INFO] [logging.py:107:log_dist] [Rank 0] step=3394, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3394 loss: 0.0859 iter time (s): 10.655 samples/sec: 0.094 +[2025-05-06 05:17:03,688] [INFO] [logging.py:107:log_dist] [Rank 0] step=3395, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3395 loss: 0.0298 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:17:14,552] [INFO] [logging.py:107:log_dist] [Rank 0] step=3396, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3396 loss: 0.0358 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 05:17:25,228] [INFO] [logging.py:107:log_dist] [Rank 0] step=3397, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3397 loss: 0.1583 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:17:35,901] [INFO] [logging.py:107:log_dist] [Rank 0] step=3398, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3398 loss: 0.0461 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:17:46,569] [INFO] [logging.py:107:log_dist] [Rank 0] step=3399, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3399 loss: 0.0839 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:17:57,242] [INFO] [logging.py:107:log_dist] [Rank 0] step=3400, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3400 loss: 0.1033 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:18:07,910] [INFO] [logging.py:107:log_dist] [Rank 0] step=3401, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3401 loss: 0.0383 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:18:18,577] [INFO] [logging.py:107:log_dist] [Rank 0] step=3402, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3402 loss: 0.1654 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:18:29,257] [INFO] [logging.py:107:log_dist] [Rank 0] step=3403, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3403 loss: 0.1137 iter time (s): 10.653 samples/sec: 0.094 +Started new epoch: 84 +[2025-05-06 05:18:40,442] [INFO] [logging.py:107:log_dist] [Rank 0] step=3404, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3404 loss: 0.0440 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-06 05:18:51,113] [INFO] [logging.py:107:log_dist] [Rank 0] step=3405, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3405 loss: 0.1122 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:19:01,788] [INFO] [logging.py:107:log_dist] [Rank 0] step=3406, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3406 loss: 0.0335 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:19:12,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=3407, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3407 loss: 0.0360 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:19:23,131] [INFO] [logging.py:107:log_dist] [Rank 0] step=3408, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3408 loss: 0.0512 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:19:33,804] [INFO] [logging.py:107:log_dist] [Rank 0] step=3409, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3409 loss: 0.0398 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:19:44,477] [INFO] [logging.py:107:log_dist] [Rank 0] step=3410, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3410 loss: 0.0590 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:19:55,153] [INFO] [logging.py:107:log_dist] [Rank 0] step=3411, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3411 loss: 0.1243 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:20:05,827] [INFO] [logging.py:107:log_dist] [Rank 0] step=3412, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3412 loss: 0.0392 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:20:16,657] [INFO] [logging.py:107:log_dist] [Rank 0] step=3413, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3413 loss: 0.0753 iter time (s): 10.799 samples/sec: 0.093 +[2025-05-06 05:20:27,329] [INFO] [logging.py:107:log_dist] [Rank 0] step=3414, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3414 loss: 0.4225 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:20:38,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=3415, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3415 loss: 0.0289 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 05:20:48,678] [INFO] [logging.py:107:log_dist] [Rank 0] step=3416, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3416 loss: 0.0406 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:20:59,347] [INFO] [logging.py:107:log_dist] [Rank 0] step=3417, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3417 loss: 0.0294 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:21:10,020] [INFO] [logging.py:107:log_dist] [Rank 0] step=3418, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3418 loss: 0.0414 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:21:20,693] [INFO] [logging.py:107:log_dist] [Rank 0] step=3419, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3419 loss: 0.1650 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:21:31,362] [INFO] [logging.py:107:log_dist] [Rank 0] step=3420, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3420 loss: 0.0786 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:21:42,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=3421, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3421 loss: 0.0800 iter time (s): 10.836 samples/sec: 0.092 +[2025-05-06 05:21:52,896] [INFO] [logging.py:107:log_dist] [Rank 0] step=3422, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3422 loss: 0.0539 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 05:22:03,569] [INFO] [logging.py:107:log_dist] [Rank 0] step=3423, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3423 loss: 0.0303 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:22:14,242] [INFO] [logging.py:107:log_dist] [Rank 0] step=3424, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3424 loss: 0.0364 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:22:24,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=3425, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3425 loss: 0.0657 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:22:35,587] [INFO] [logging.py:107:log_dist] [Rank 0] step=3426, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3426 loss: 0.1190 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:22:46,267] [INFO] [logging.py:107:log_dist] [Rank 0] step=3427, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3427 loss: 0.0633 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 05:22:56,937] [INFO] [logging.py:107:log_dist] [Rank 0] step=3428, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3428 loss: 0.1385 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:23:07,606] [INFO] [logging.py:107:log_dist] [Rank 0] step=3429, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3429 loss: 0.0294 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:23:18,469] [INFO] [logging.py:107:log_dist] [Rank 0] step=3430, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3430 loss: 0.0913 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 05:23:29,140] [INFO] [logging.py:107:log_dist] [Rank 0] step=3431, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3431 loss: 0.1152 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:23:39,827] [INFO] [logging.py:107:log_dist] [Rank 0] step=3432, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3432 loss: 0.0366 iter time (s): 10.656 samples/sec: 0.094 +[2025-05-06 05:23:50,503] [INFO] [logging.py:107:log_dist] [Rank 0] step=3433, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3433 loss: 0.0284 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:24:01,181] [INFO] [logging.py:107:log_dist] [Rank 0] step=3434, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3434 loss: 0.2955 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 05:24:11,851] [INFO] [logging.py:107:log_dist] [Rank 0] step=3435, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3435 loss: 0.0768 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:24:22,523] [INFO] [logging.py:107:log_dist] [Rank 0] step=3436, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3436 loss: 0.0339 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:24:33,197] [INFO] [logging.py:107:log_dist] [Rank 0] step=3437, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3437 loss: 0.0901 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:24:43,866] [INFO] [logging.py:107:log_dist] [Rank 0] step=3438, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3438 loss: 0.0360 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:24:54,703] [INFO] [logging.py:107:log_dist] [Rank 0] step=3439, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3439 loss: 0.0522 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-06 05:25:05,377] [INFO] [logging.py:107:log_dist] [Rank 0] step=3440, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3440 loss: 0.1586 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:25:16,055] [INFO] [logging.py:107:log_dist] [Rank 0] step=3441, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3441 loss: 0.0623 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 05:25:26,732] [INFO] [logging.py:107:log_dist] [Rank 0] step=3442, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3442 loss: 0.0405 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:25:37,405] [INFO] [logging.py:107:log_dist] [Rank 0] step=3443, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3443 loss: 0.0434 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:25:48,074] [INFO] [logging.py:107:log_dist] [Rank 0] step=3444, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3444 loss: 0.0666 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 85 +[2025-05-06 05:25:59,081] [INFO] [logging.py:107:log_dist] [Rank 0] step=3445, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3445 loss: 0.0767 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:26:09,751] [INFO] [logging.py:107:log_dist] [Rank 0] step=3446, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3446 loss: 0.0866 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:26:20,589] [INFO] [logging.py:107:log_dist] [Rank 0] step=3447, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3447 loss: 0.0510 iter time (s): 10.807 samples/sec: 0.093 +[2025-05-06 05:26:31,261] [INFO] [logging.py:107:log_dist] [Rank 0] step=3448, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3448 loss: 0.1094 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:26:41,934] [INFO] [logging.py:107:log_dist] [Rank 0] step=3449, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3449 loss: 0.0639 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:26:52,608] [INFO] [logging.py:107:log_dist] [Rank 0] step=3450, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3450 loss: 0.0525 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:27:03,299] [INFO] [logging.py:107:log_dist] [Rank 0] step=3451, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3451 loss: 0.0421 iter time (s): 10.660 samples/sec: 0.094 +[2025-05-06 05:27:13,977] [INFO] [logging.py:107:log_dist] [Rank 0] step=3452, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3452 loss: 0.0661 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:27:24,650] [INFO] [logging.py:107:log_dist] [Rank 0] step=3453, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3453 loss: 0.1933 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:27:35,320] [INFO] [logging.py:107:log_dist] [Rank 0] step=3454, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3454 loss: 0.0642 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:27:45,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=3455, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3455 loss: 0.1522 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:27:56,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=3456, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3456 loss: 0.0377 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-06 05:28:07,504] [INFO] [logging.py:107:log_dist] [Rank 0] step=3457, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3457 loss: 0.0308 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:28:18,174] [INFO] [logging.py:107:log_dist] [Rank 0] step=3458, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3458 loss: 0.0244 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:28:28,852] [INFO] [logging.py:107:log_dist] [Rank 0] step=3459, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3459 loss: 0.0530 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 05:28:39,521] [INFO] [logging.py:107:log_dist] [Rank 0] step=3460, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3460 loss: 0.0390 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:28:50,191] [INFO] [logging.py:107:log_dist] [Rank 0] step=3461, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3461 loss: 0.0284 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:29:00,863] [INFO] [logging.py:107:log_dist] [Rank 0] step=3462, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3462 loss: 0.0516 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:29:11,537] [INFO] [logging.py:107:log_dist] [Rank 0] step=3463, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3463 loss: 0.0320 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:29:22,397] [INFO] [logging.py:107:log_dist] [Rank 0] step=3464, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3464 loss: 0.0344 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 05:29:33,069] [INFO] [logging.py:107:log_dist] [Rank 0] step=3465, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3465 loss: 0.0809 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:29:43,736] [INFO] [logging.py:107:log_dist] [Rank 0] step=3466, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3466 loss: 0.0335 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:29:54,408] [INFO] [logging.py:107:log_dist] [Rank 0] step=3467, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3467 loss: 0.0941 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:30:05,085] [INFO] [logging.py:107:log_dist] [Rank 0] step=3468, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3468 loss: 0.0442 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:30:15,756] [INFO] [logging.py:107:log_dist] [Rank 0] step=3469, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3469 loss: 0.2915 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:30:26,430] [INFO] [logging.py:107:log_dist] [Rank 0] step=3470, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3470 loss: 0.1018 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:30:37,113] [INFO] [logging.py:107:log_dist] [Rank 0] step=3471, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3471 loss: 0.0417 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:30:47,784] [INFO] [logging.py:107:log_dist] [Rank 0] step=3472, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3472 loss: 0.0785 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:30:58,645] [INFO] [logging.py:107:log_dist] [Rank 0] step=3473, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3473 loss: 0.0345 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 05:31:09,322] [INFO] [logging.py:107:log_dist] [Rank 0] step=3474, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3474 loss: 0.0299 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 05:31:19,998] [INFO] [logging.py:107:log_dist] [Rank 0] step=3475, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3475 loss: 0.0965 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:31:30,671] [INFO] [logging.py:107:log_dist] [Rank 0] step=3476, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3476 loss: 0.1328 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:31:41,344] [INFO] [logging.py:107:log_dist] [Rank 0] step=3477, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3477 loss: 0.0649 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:31:52,014] [INFO] [logging.py:107:log_dist] [Rank 0] step=3478, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3478 loss: 0.0913 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:32:02,687] [INFO] [logging.py:107:log_dist] [Rank 0] step=3479, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3479 loss: 0.0374 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:32:13,360] [INFO] [logging.py:107:log_dist] [Rank 0] step=3480, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3480 loss: 0.0265 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:32:24,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=3481, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3481 loss: 0.4991 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:32:34,857] [INFO] [logging.py:107:log_dist] [Rank 0] step=3482, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3482 loss: 0.0426 iter time (s): 10.798 samples/sec: 0.093 +[2025-05-06 05:32:45,531] [INFO] [logging.py:107:log_dist] [Rank 0] step=3483, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3483 loss: 0.0942 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:32:56,198] [INFO] [logging.py:107:log_dist] [Rank 0] step=3484, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3484 loss: 0.0361 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:33:06,866] [INFO] [logging.py:107:log_dist] [Rank 0] step=3485, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3485 loss: 0.0612 iter time (s): 10.641 samples/sec: 0.094 +Started new epoch: 86 +[2025-05-06 05:33:17,880] [INFO] [logging.py:107:log_dist] [Rank 0] step=3486, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3486 loss: 0.0685 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:33:28,552] [INFO] [logging.py:107:log_dist] [Rank 0] step=3487, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3487 loss: 0.0302 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:33:39,220] [INFO] [logging.py:107:log_dist] [Rank 0] step=3488, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3488 loss: 0.0729 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:33:49,892] [INFO] [logging.py:107:log_dist] [Rank 0] step=3489, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3489 loss: 0.0487 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:34:00,758] [INFO] [logging.py:107:log_dist] [Rank 0] step=3490, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3490 loss: 0.0412 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 05:34:11,435] [INFO] [logging.py:107:log_dist] [Rank 0] step=3491, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3491 loss: 0.1211 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:34:22,113] [INFO] [logging.py:107:log_dist] [Rank 0] step=3492, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3492 loss: 0.0888 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:34:32,789] [INFO] [logging.py:107:log_dist] [Rank 0] step=3493, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3493 loss: 0.0339 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:34:43,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=3494, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3494 loss: 0.1653 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:34:54,139] [INFO] [logging.py:107:log_dist] [Rank 0] step=3495, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3495 loss: 0.2389 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 05:35:04,814] [INFO] [logging.py:107:log_dist] [Rank 0] step=3496, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3496 loss: 0.0365 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:35:15,489] [INFO] [logging.py:107:log_dist] [Rank 0] step=3497, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3497 loss: 0.0504 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:35:26,167] [INFO] [logging.py:107:log_dist] [Rank 0] step=3498, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3498 loss: 0.1550 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 05:35:37,026] [INFO] [logging.py:107:log_dist] [Rank 0] step=3499, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3499 loss: 0.0943 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 05:35:47,701] [INFO] [logging.py:107:log_dist] [Rank 0] step=3500, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3500 loss: 0.1403 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:35:58,388] [INFO] [logging.py:107:log_dist] [Rank 0] step=3501, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3501 loss: 0.0880 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 05:36:09,063] [INFO] [logging.py:107:log_dist] [Rank 0] step=3502, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3502 loss: 0.3178 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:36:19,739] [INFO] [logging.py:107:log_dist] [Rank 0] step=3503, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3503 loss: 0.0338 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:36:30,410] [INFO] [logging.py:107:log_dist] [Rank 0] step=3504, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3504 loss: 0.0407 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:36:41,084] [INFO] [logging.py:107:log_dist] [Rank 0] step=3505, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3505 loss: 0.1463 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:36:51,757] [INFO] [logging.py:107:log_dist] [Rank 0] step=3506, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3506 loss: 0.0410 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:37:02,426] [INFO] [logging.py:107:log_dist] [Rank 0] step=3507, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3507 loss: 0.0985 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:37:13,264] [INFO] [logging.py:107:log_dist] [Rank 0] step=3508, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3508 loss: 0.0793 iter time (s): 10.806 samples/sec: 0.093 +[2025-05-06 05:37:23,936] [INFO] [logging.py:107:log_dist] [Rank 0] step=3509, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3509 loss: 0.0560 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:37:34,604] [INFO] [logging.py:107:log_dist] [Rank 0] step=3510, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3510 loss: 0.0686 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:37:45,274] [INFO] [logging.py:107:log_dist] [Rank 0] step=3511, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3511 loss: 0.0983 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:37:55,947] [INFO] [logging.py:107:log_dist] [Rank 0] step=3512, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3512 loss: 0.1024 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:38:06,616] [INFO] [logging.py:107:log_dist] [Rank 0] step=3513, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3513 loss: 0.0348 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:38:17,287] [INFO] [logging.py:107:log_dist] [Rank 0] step=3514, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3514 loss: 0.0456 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:38:27,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=3515, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3515 loss: 0.0345 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:38:38,799] [INFO] [logging.py:107:log_dist] [Rank 0] step=3516, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3516 loss: 0.0405 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-06 05:38:49,471] [INFO] [logging.py:107:log_dist] [Rank 0] step=3517, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3517 loss: 0.0505 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:39:00,148] [INFO] [logging.py:107:log_dist] [Rank 0] step=3518, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3518 loss: 0.0339 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:39:10,821] [INFO] [logging.py:107:log_dist] [Rank 0] step=3519, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3519 loss: 0.1070 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:39:21,491] [INFO] [logging.py:107:log_dist] [Rank 0] step=3520, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3520 loss: 0.0228 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:39:32,167] [INFO] [logging.py:107:log_dist] [Rank 0] step=3521, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3521 loss: 0.0307 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:39:42,834] [INFO] [logging.py:107:log_dist] [Rank 0] step=3522, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3522 loss: 0.1794 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 05:39:53,510] [INFO] [logging.py:107:log_dist] [Rank 0] step=3523, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3523 loss: 0.0539 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:40:04,186] [INFO] [logging.py:107:log_dist] [Rank 0] step=3524, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3524 loss: 0.0891 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:40:15,018] [INFO] [logging.py:107:log_dist] [Rank 0] step=3525, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3525 loss: 0.1894 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 05:40:25,683] [INFO] [logging.py:107:log_dist] [Rank 0] step=3526, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3526 loss: 0.0759 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 87 +[2025-05-06 05:40:36,697] [INFO] [logging.py:107:log_dist] [Rank 0] step=3527, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3527 loss: 0.0502 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:40:47,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=3528, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3528 loss: 0.0290 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:40:58,035] [INFO] [logging.py:107:log_dist] [Rank 0] step=3529, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3529 loss: 0.0331 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:41:08,709] [INFO] [logging.py:107:log_dist] [Rank 0] step=3530, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3530 loss: 0.0819 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:41:19,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=3531, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3531 loss: 0.1043 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:41:30,056] [INFO] [logging.py:107:log_dist] [Rank 0] step=3532, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3532 loss: 0.0734 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:41:40,931] [INFO] [logging.py:107:log_dist] [Rank 0] step=3533, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3533 loss: 0.0314 iter time (s): 10.829 samples/sec: 0.092 +[2025-05-06 05:41:51,606] [INFO] [logging.py:107:log_dist] [Rank 0] step=3534, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3534 loss: 0.0373 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:42:02,277] [INFO] [logging.py:107:log_dist] [Rank 0] step=3535, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3535 loss: 0.0424 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:42:12,953] [INFO] [logging.py:107:log_dist] [Rank 0] step=3536, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3536 loss: 0.0585 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:42:23,625] [INFO] [logging.py:107:log_dist] [Rank 0] step=3537, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3537 loss: 0.0655 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:42:34,294] [INFO] [logging.py:107:log_dist] [Rank 0] step=3538, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3538 loss: 0.0364 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:42:44,967] [INFO] [logging.py:107:log_dist] [Rank 0] step=3539, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3539 loss: 0.0363 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:42:55,644] [INFO] [logging.py:107:log_dist] [Rank 0] step=3540, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3540 loss: 0.0400 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:43:06,315] [INFO] [logging.py:107:log_dist] [Rank 0] step=3541, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3541 loss: 0.0257 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:43:17,180] [INFO] [logging.py:107:log_dist] [Rank 0] step=3542, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3542 loss: 0.0507 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-06 05:43:27,848] [INFO] [logging.py:107:log_dist] [Rank 0] step=3543, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3543 loss: 0.0876 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:43:38,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=3544, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3544 loss: 0.0689 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:43:49,204] [INFO] [logging.py:107:log_dist] [Rank 0] step=3545, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3545 loss: 0.0273 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 05:43:59,879] [INFO] [logging.py:107:log_dist] [Rank 0] step=3546, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3546 loss: 0.0748 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:44:10,551] [INFO] [logging.py:107:log_dist] [Rank 0] step=3547, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3547 loss: 0.1515 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:44:21,223] [INFO] [logging.py:107:log_dist] [Rank 0] step=3548, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3548 loss: 0.2347 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:44:31,894] [INFO] [logging.py:107:log_dist] [Rank 0] step=3549, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3549 loss: 0.1670 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:44:42,565] [INFO] [logging.py:107:log_dist] [Rank 0] step=3550, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3550 loss: 0.1559 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:44:53,405] [INFO] [logging.py:107:log_dist] [Rank 0] step=3551, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3551 loss: 0.0661 iter time (s): 10.808 samples/sec: 0.093 +[2025-05-06 05:45:04,076] [INFO] [logging.py:107:log_dist] [Rank 0] step=3552, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3552 loss: 0.1567 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:45:14,749] [INFO] [logging.py:107:log_dist] [Rank 0] step=3553, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3553 loss: 0.0984 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:45:25,420] [INFO] [logging.py:107:log_dist] [Rank 0] step=3554, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3554 loss: 0.0337 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:45:36,092] [INFO] [logging.py:107:log_dist] [Rank 0] step=3555, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3555 loss: 0.0703 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:45:46,768] [INFO] [logging.py:107:log_dist] [Rank 0] step=3556, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3556 loss: 0.0284 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:45:57,443] [INFO] [logging.py:107:log_dist] [Rank 0] step=3557, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3557 loss: 0.3952 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:46:08,114] [INFO] [logging.py:107:log_dist] [Rank 0] step=3558, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3558 loss: 0.0332 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:46:18,978] [INFO] [logging.py:107:log_dist] [Rank 0] step=3559, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3559 loss: 0.1403 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 05:46:29,648] [INFO] [logging.py:107:log_dist] [Rank 0] step=3560, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3560 loss: 0.0490 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:46:40,321] [INFO] [logging.py:107:log_dist] [Rank 0] step=3561, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3561 loss: 0.0640 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:46:50,994] [INFO] [logging.py:107:log_dist] [Rank 0] step=3562, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3562 loss: 0.1051 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:47:01,665] [INFO] [logging.py:107:log_dist] [Rank 0] step=3563, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3563 loss: 0.0514 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:47:12,332] [INFO] [logging.py:107:log_dist] [Rank 0] step=3564, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3564 loss: 0.1010 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 05:47:23,005] [INFO] [logging.py:107:log_dist] [Rank 0] step=3565, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3565 loss: 0.6048 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:47:33,675] [INFO] [logging.py:107:log_dist] [Rank 0] step=3566, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3566 loss: 0.1043 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:47:44,344] [INFO] [logging.py:107:log_dist] [Rank 0] step=3567, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3567 loss: 0.0492 iter time (s): 10.642 samples/sec: 0.094 +Started new epoch: 88 +[2025-05-06 05:47:55,563] [INFO] [logging.py:107:log_dist] [Rank 0] step=3568, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3568 loss: 0.0487 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-06 05:48:06,237] [INFO] [logging.py:107:log_dist] [Rank 0] step=3569, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3569 loss: 0.0422 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:48:16,907] [INFO] [logging.py:107:log_dist] [Rank 0] step=3570, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3570 loss: 0.1091 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:48:27,581] [INFO] [logging.py:107:log_dist] [Rank 0] step=3571, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3571 loss: 0.0583 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:48:38,255] [INFO] [logging.py:107:log_dist] [Rank 0] step=3572, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3572 loss: 0.1639 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:48:48,927] [INFO] [logging.py:107:log_dist] [Rank 0] step=3573, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3573 loss: 0.0627 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 05:48:59,598] [INFO] [logging.py:107:log_dist] [Rank 0] step=3574, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3574 loss: 0.0384 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:49:10,272] [INFO] [logging.py:107:log_dist] [Rank 0] step=3575, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3575 loss: 0.0936 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:49:20,939] [INFO] [logging.py:107:log_dist] [Rank 0] step=3576, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3576 loss: 0.1845 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 05:49:31,770] [INFO] [logging.py:107:log_dist] [Rank 0] step=3577, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3577 loss: 0.0278 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 05:49:42,438] [INFO] [logging.py:107:log_dist] [Rank 0] step=3578, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3578 loss: 0.0628 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:49:53,111] [INFO] [logging.py:107:log_dist] [Rank 0] step=3579, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3579 loss: 0.0767 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:50:03,785] [INFO] [logging.py:107:log_dist] [Rank 0] step=3580, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3580 loss: 0.2503 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:50:14,458] [INFO] [logging.py:107:log_dist] [Rank 0] step=3581, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3581 loss: 0.1223 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:50:25,128] [INFO] [logging.py:107:log_dist] [Rank 0] step=3582, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3582 loss: 0.1200 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:50:35,800] [INFO] [logging.py:107:log_dist] [Rank 0] step=3583, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3583 loss: 0.0284 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:50:46,473] [INFO] [logging.py:107:log_dist] [Rank 0] step=3584, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3584 loss: 0.0408 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:50:57,304] [INFO] [logging.py:107:log_dist] [Rank 0] step=3585, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3585 loss: 0.0580 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 05:51:07,976] [INFO] [logging.py:107:log_dist] [Rank 0] step=3586, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3586 loss: 0.0330 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:51:18,651] [INFO] [logging.py:107:log_dist] [Rank 0] step=3587, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3587 loss: 0.0399 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:51:29,322] [INFO] [logging.py:107:log_dist] [Rank 0] step=3588, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3588 loss: 0.1481 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:51:39,999] [INFO] [logging.py:107:log_dist] [Rank 0] step=3589, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3589 loss: 0.0456 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:51:50,669] [INFO] [logging.py:107:log_dist] [Rank 0] step=3590, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3590 loss: 0.0456 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:52:01,340] [INFO] [logging.py:107:log_dist] [Rank 0] step=3591, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3591 loss: 0.0329 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:52:12,026] [INFO] [logging.py:107:log_dist] [Rank 0] step=3592, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3592 loss: 0.0435 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 05:52:22,698] [INFO] [logging.py:107:log_dist] [Rank 0] step=3593, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3593 loss: 0.0633 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:52:33,534] [INFO] [logging.py:107:log_dist] [Rank 0] step=3594, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3594 loss: 0.0765 iter time (s): 10.805 samples/sec: 0.093 +[2025-05-06 05:52:44,209] [INFO] [logging.py:107:log_dist] [Rank 0] step=3595, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3595 loss: 0.0682 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:52:54,882] [INFO] [logging.py:107:log_dist] [Rank 0] step=3596, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3596 loss: 0.0753 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:53:05,557] [INFO] [logging.py:107:log_dist] [Rank 0] step=3597, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3597 loss: 0.0290 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:53:16,230] [INFO] [logging.py:107:log_dist] [Rank 0] step=3598, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3598 loss: 0.1869 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:53:26,903] [INFO] [logging.py:107:log_dist] [Rank 0] step=3599, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3599 loss: 0.1646 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:53:37,575] [INFO] [logging.py:107:log_dist] [Rank 0] step=3600, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3600 loss: 0.0292 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:53:48,251] [INFO] [logging.py:107:log_dist] [Rank 0] step=3601, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3601 loss: 0.2153 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:53:59,110] [INFO] [logging.py:107:log_dist] [Rank 0] step=3602, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3602 loss: 0.0401 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 05:54:09,784] [INFO] [logging.py:107:log_dist] [Rank 0] step=3603, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3603 loss: 0.0344 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:54:20,457] [INFO] [logging.py:107:log_dist] [Rank 0] step=3604, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3604 loss: 0.0564 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:54:31,128] [INFO] [logging.py:107:log_dist] [Rank 0] step=3605, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3605 loss: 0.0362 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:54:41,806] [INFO] [logging.py:107:log_dist] [Rank 0] step=3606, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3606 loss: 0.0897 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 05:54:52,477] [INFO] [logging.py:107:log_dist] [Rank 0] step=3607, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3607 loss: 0.0462 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:55:03,144] [INFO] [logging.py:107:log_dist] [Rank 0] step=3608, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3608 loss: 0.0423 iter time (s): 10.640 samples/sec: 0.094 +Started new epoch: 89 +[2025-05-06 05:55:14,159] [INFO] [logging.py:107:log_dist] [Rank 0] step=3609, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3609 loss: 0.0350 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:55:24,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=3610, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3610 loss: 0.1160 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:55:35,692] [INFO] [logging.py:107:log_dist] [Rank 0] step=3611, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3611 loss: 0.0664 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 05:55:46,366] [INFO] [logging.py:107:log_dist] [Rank 0] step=3612, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3612 loss: 0.0519 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:55:57,037] [INFO] [logging.py:107:log_dist] [Rank 0] step=3613, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3613 loss: 0.0476 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:56:07,706] [INFO] [logging.py:107:log_dist] [Rank 0] step=3614, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3614 loss: 0.0582 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:56:18,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=3615, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3615 loss: 0.0959 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:56:29,053] [INFO] [logging.py:107:log_dist] [Rank 0] step=3616, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3616 loss: 0.0592 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:56:39,725] [INFO] [logging.py:107:log_dist] [Rank 0] step=3617, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3617 loss: 0.0890 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:56:50,398] [INFO] [logging.py:107:log_dist] [Rank 0] step=3618, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3618 loss: 0.0361 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 05:57:01,069] [INFO] [logging.py:107:log_dist] [Rank 0] step=3619, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3619 loss: 0.1022 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:57:11,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=3620, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3620 loss: 0.0540 iter time (s): 10.814 samples/sec: 0.092 +[2025-05-06 05:57:22,591] [INFO] [logging.py:107:log_dist] [Rank 0] step=3621, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3621 loss: 0.0982 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 05:57:33,262] [INFO] [logging.py:107:log_dist] [Rank 0] step=3622, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3622 loss: 0.0837 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:57:43,932] [INFO] [logging.py:107:log_dist] [Rank 0] step=3623, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3623 loss: 0.0310 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:57:54,603] [INFO] [logging.py:107:log_dist] [Rank 0] step=3624, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3624 loss: 0.0325 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:58:05,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=3625, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3625 loss: 0.0527 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 05:58:15,949] [INFO] [logging.py:107:log_dist] [Rank 0] step=3626, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3626 loss: 0.0306 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 05:58:26,622] [INFO] [logging.py:107:log_dist] [Rank 0] step=3627, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3627 loss: 0.2749 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:58:37,482] [INFO] [logging.py:107:log_dist] [Rank 0] step=3628, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3628 loss: 0.0313 iter time (s): 10.834 samples/sec: 0.092 +[2025-05-06 05:58:48,162] [INFO] [logging.py:107:log_dist] [Rank 0] step=3629, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3629 loss: 0.0889 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:58:58,836] [INFO] [logging.py:107:log_dist] [Rank 0] step=3630, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3630 loss: 0.0616 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 05:59:09,507] [INFO] [logging.py:107:log_dist] [Rank 0] step=3631, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3631 loss: 0.0591 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:59:20,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=3632, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3632 loss: 0.2389 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 05:59:30,852] [INFO] [logging.py:107:log_dist] [Rank 0] step=3633, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3633 loss: 0.0592 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 05:59:41,521] [INFO] [logging.py:107:log_dist] [Rank 0] step=3634, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3634 loss: 0.0574 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 05:59:52,194] [INFO] [logging.py:107:log_dist] [Rank 0] step=3635, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3635 loss: 0.0273 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:00:02,883] [INFO] [logging.py:107:log_dist] [Rank 0] step=3636, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3636 loss: 0.1896 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 06:00:13,747] [INFO] [logging.py:107:log_dist] [Rank 0] step=3637, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3637 loss: 0.0357 iter time (s): 10.832 samples/sec: 0.092 +[2025-05-06 06:00:24,422] [INFO] [logging.py:107:log_dist] [Rank 0] step=3638, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3638 loss: 0.1098 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:00:35,104] [INFO] [logging.py:107:log_dist] [Rank 0] step=3639, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3639 loss: 0.1228 iter time (s): 10.650 samples/sec: 0.094 +[2025-05-06 06:00:45,778] [INFO] [logging.py:107:log_dist] [Rank 0] step=3640, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3640 loss: 0.0640 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:00:56,447] [INFO] [logging.py:107:log_dist] [Rank 0] step=3641, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3641 loss: 0.0365 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:01:07,123] [INFO] [logging.py:107:log_dist] [Rank 0] step=3642, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3642 loss: 0.0570 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:01:17,793] [INFO] [logging.py:107:log_dist] [Rank 0] step=3643, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3643 loss: 0.0255 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:01:28,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=3644, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3644 loss: 0.0278 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:01:39,137] [INFO] [logging.py:107:log_dist] [Rank 0] step=3645, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3645 loss: 0.0938 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:01:49,961] [INFO] [logging.py:107:log_dist] [Rank 0] step=3646, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3646 loss: 0.0935 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-06 06:02:00,632] [INFO] [logging.py:107:log_dist] [Rank 0] step=3647, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3647 loss: 0.0724 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:02:11,306] [INFO] [logging.py:107:log_dist] [Rank 0] step=3648, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3648 loss: 0.0885 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:02:21,972] [INFO] [logging.py:107:log_dist] [Rank 0] step=3649, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3649 loss: 0.0525 iter time (s): 10.639 samples/sec: 0.094 +Started new epoch: 90 +[2025-05-06 06:02:32,990] [INFO] [logging.py:107:log_dist] [Rank 0] step=3650, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3650 loss: 0.0721 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:02:43,664] [INFO] [logging.py:107:log_dist] [Rank 0] step=3651, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3651 loss: 0.1148 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:02:54,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=3652, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3652 loss: 0.0315 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:03:05,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=3653, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3653 loss: 0.0285 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:03:15,854] [INFO] [logging.py:107:log_dist] [Rank 0] step=3654, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3654 loss: 0.0458 iter time (s): 10.815 samples/sec: 0.092 +[2025-05-06 06:03:26,525] [INFO] [logging.py:107:log_dist] [Rank 0] step=3655, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3655 loss: 0.0369 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:03:37,197] [INFO] [logging.py:107:log_dist] [Rank 0] step=3656, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3656 loss: 0.0679 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:03:47,874] [INFO] [logging.py:107:log_dist] [Rank 0] step=3657, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3657 loss: 0.3063 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 06:03:58,547] [INFO] [logging.py:107:log_dist] [Rank 0] step=3658, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3658 loss: 0.0587 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:04:09,252] [INFO] [logging.py:107:log_dist] [Rank 0] step=3659, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3659 loss: 0.1101 iter time (s): 10.674 samples/sec: 0.094 +[2025-05-06 06:04:19,925] [INFO] [logging.py:107:log_dist] [Rank 0] step=3660, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3660 loss: 0.0340 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:04:30,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=3661, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3661 loss: 0.0401 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:04:41,274] [INFO] [logging.py:107:log_dist] [Rank 0] step=3662, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3662 loss: 0.1249 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 06:04:52,116] [INFO] [logging.py:107:log_dist] [Rank 0] step=3663, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3663 loss: 0.0352 iter time (s): 10.810 samples/sec: 0.093 +[2025-05-06 06:05:02,789] [INFO] [logging.py:107:log_dist] [Rank 0] step=3664, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3664 loss: 0.0552 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:05:13,464] [INFO] [logging.py:107:log_dist] [Rank 0] step=3665, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3665 loss: 0.3322 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:05:24,135] [INFO] [logging.py:107:log_dist] [Rank 0] step=3666, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3666 loss: 0.0640 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:05:34,806] [INFO] [logging.py:107:log_dist] [Rank 0] step=3667, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3667 loss: 0.1525 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:05:45,479] [INFO] [logging.py:107:log_dist] [Rank 0] step=3668, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3668 loss: 0.3529 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:05:56,152] [INFO] [logging.py:107:log_dist] [Rank 0] step=3669, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3669 loss: 0.0754 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:06:06,828] [INFO] [logging.py:107:log_dist] [Rank 0] step=3670, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3670 loss: 0.1410 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:06:17,691] [INFO] [logging.py:107:log_dist] [Rank 0] step=3671, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3671 loss: 0.0315 iter time (s): 10.833 samples/sec: 0.092 +[2025-05-06 06:06:28,362] [INFO] [logging.py:107:log_dist] [Rank 0] step=3672, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3672 loss: 0.0494 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:06:39,035] [INFO] [logging.py:107:log_dist] [Rank 0] step=3673, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3673 loss: 0.0297 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:06:49,710] [INFO] [logging.py:107:log_dist] [Rank 0] step=3674, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3674 loss: 0.0334 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:07:00,384] [INFO] [logging.py:107:log_dist] [Rank 0] step=3675, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3675 loss: 0.2004 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:07:11,054] [INFO] [logging.py:107:log_dist] [Rank 0] step=3676, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3676 loss: 0.0399 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:07:21,730] [INFO] [logging.py:107:log_dist] [Rank 0] step=3677, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3677 loss: 0.1417 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:07:32,404] [INFO] [logging.py:107:log_dist] [Rank 0] step=3678, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3678 loss: 0.0295 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:07:43,075] [INFO] [logging.py:107:log_dist] [Rank 0] step=3679, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3679 loss: 0.0611 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:07:53,942] [INFO] [logging.py:107:log_dist] [Rank 0] step=3680, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3680 loss: 0.0942 iter time (s): 10.836 samples/sec: 0.092 +[2025-05-06 06:08:04,619] [INFO] [logging.py:107:log_dist] [Rank 0] step=3681, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3681 loss: 0.2153 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:08:15,288] [INFO] [logging.py:107:log_dist] [Rank 0] step=3682, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3682 loss: 0.0370 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:08:25,968] [INFO] [logging.py:107:log_dist] [Rank 0] step=3683, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3683 loss: 0.0994 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 06:08:36,638] [INFO] [logging.py:107:log_dist] [Rank 0] step=3684, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3684 loss: 0.0481 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:08:47,308] [INFO] [logging.py:107:log_dist] [Rank 0] step=3685, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3685 loss: 0.0400 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:08:57,987] [INFO] [logging.py:107:log_dist] [Rank 0] step=3686, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3686 loss: 0.0501 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 06:09:08,655] [INFO] [logging.py:107:log_dist] [Rank 0] step=3687, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3687 loss: 0.2065 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:09:19,323] [INFO] [logging.py:107:log_dist] [Rank 0] step=3688, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3688 loss: 0.0630 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:09:30,154] [INFO] [logging.py:107:log_dist] [Rank 0] step=3689, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3689 loss: 0.0602 iter time (s): 10.801 samples/sec: 0.093 +[2025-05-06 06:09:40,819] [INFO] [logging.py:107:log_dist] [Rank 0] step=3690, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3690 loss: 0.4886 iter time (s): 10.639 samples/sec: 0.094 +Saving model to directory epoch90 +Started new epoch: 91 +[2025-05-06 06:09:53,417] [INFO] [logging.py:107:log_dist] [Rank 0] step=3691, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3691 loss: 0.0329 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:10:04,098] [INFO] [logging.py:107:log_dist] [Rank 0] step=3692, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3692 loss: 0.0709 iter time (s): 10.651 samples/sec: 0.094 +[2025-05-06 06:10:14,769] [INFO] [logging.py:107:log_dist] [Rank 0] step=3693, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3693 loss: 0.0382 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:10:25,439] [INFO] [logging.py:107:log_dist] [Rank 0] step=3694, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3694 loss: 0.0310 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:10:36,113] [INFO] [logging.py:107:log_dist] [Rank 0] step=3695, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3695 loss: 0.2111 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:10:46,786] [INFO] [logging.py:107:log_dist] [Rank 0] step=3696, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3696 loss: 0.0801 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:10:57,646] [INFO] [logging.py:107:log_dist] [Rank 0] step=3697, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3697 loss: 0.0665 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 06:11:08,329] [INFO] [logging.py:107:log_dist] [Rank 0] step=3698, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3698 loss: 0.0348 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 06:11:19,007] [INFO] [logging.py:107:log_dist] [Rank 0] step=3699, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3699 loss: 0.0422 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:11:29,675] [INFO] [logging.py:107:log_dist] [Rank 0] step=3700, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3700 loss: 0.0500 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:11:40,349] [INFO] [logging.py:107:log_dist] [Rank 0] step=3701, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3701 loss: 0.0503 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:11:51,021] [INFO] [logging.py:107:log_dist] [Rank 0] step=3702, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3702 loss: 0.0832 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:12:01,705] [INFO] [logging.py:107:log_dist] [Rank 0] step=3703, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3703 loss: 0.0358 iter time (s): 10.653 samples/sec: 0.094 +[2025-05-06 06:12:12,383] [INFO] [logging.py:107:log_dist] [Rank 0] step=3704, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3704 loss: 0.0925 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:12:23,055] [INFO] [logging.py:107:log_dist] [Rank 0] step=3705, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3705 loss: 0.0220 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:12:33,917] [INFO] [logging.py:107:log_dist] [Rank 0] step=3706, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3706 loss: 0.1073 iter time (s): 10.831 samples/sec: 0.092 +[2025-05-06 06:12:44,589] [INFO] [logging.py:107:log_dist] [Rank 0] step=3707, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3707 loss: 0.2405 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:12:55,258] [INFO] [logging.py:107:log_dist] [Rank 0] step=3708, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3708 loss: 0.1287 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:13:05,934] [INFO] [logging.py:107:log_dist] [Rank 0] step=3709, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3709 loss: 0.0554 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:13:16,607] [INFO] [logging.py:107:log_dist] [Rank 0] step=3710, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3710 loss: 0.1515 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:13:27,277] [INFO] [logging.py:107:log_dist] [Rank 0] step=3711, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3711 loss: 0.0319 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:13:37,954] [INFO] [logging.py:107:log_dist] [Rank 0] step=3712, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3712 loss: 0.0568 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 06:13:48,626] [INFO] [logging.py:107:log_dist] [Rank 0] step=3713, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3713 loss: 0.0336 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:13:59,461] [INFO] [logging.py:107:log_dist] [Rank 0] step=3714, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3714 loss: 0.0676 iter time (s): 10.804 samples/sec: 0.093 +[2025-05-06 06:14:10,139] [INFO] [logging.py:107:log_dist] [Rank 0] step=3715, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3715 loss: 0.0406 iter time (s): 10.647 samples/sec: 0.094 +[2025-05-06 06:14:20,813] [INFO] [logging.py:107:log_dist] [Rank 0] step=3716, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3716 loss: 0.0691 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:14:31,484] [INFO] [logging.py:107:log_dist] [Rank 0] step=3717, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3717 loss: 0.1081 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:14:42,161] [INFO] [logging.py:107:log_dist] [Rank 0] step=3718, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3718 loss: 0.0860 iter time (s): 10.646 samples/sec: 0.094 +[2025-05-06 06:14:52,834] [INFO] [logging.py:107:log_dist] [Rank 0] step=3719, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3719 loss: 0.0432 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:15:03,506] [INFO] [logging.py:107:log_dist] [Rank 0] step=3720, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3720 loss: 0.0520 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:15:14,179] [INFO] [logging.py:107:log_dist] [Rank 0] step=3721, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3721 loss: 0.0336 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:15:24,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=3722, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3722 loss: 0.0329 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:15:35,679] [INFO] [logging.py:107:log_dist] [Rank 0] step=3723, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3723 loss: 0.0285 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 06:15:46,359] [INFO] [logging.py:107:log_dist] [Rank 0] step=3724, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3724 loss: 0.3986 iter time (s): 10.648 samples/sec: 0.094 +[2025-05-06 06:15:57,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=3725, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3725 loss: 0.2825 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:16:07,696] [INFO] [logging.py:107:log_dist] [Rank 0] step=3726, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3726 loss: 0.0269 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 06:16:18,367] [INFO] [logging.py:107:log_dist] [Rank 0] step=3727, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3727 loss: 0.0607 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:16:29,042] [INFO] [logging.py:107:log_dist] [Rank 0] step=3728, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3728 loss: 0.1316 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:16:39,709] [INFO] [logging.py:107:log_dist] [Rank 0] step=3729, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3729 loss: 0.0721 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:16:50,381] [INFO] [logging.py:107:log_dist] [Rank 0] step=3730, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3730 loss: 0.0291 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:17:01,235] [INFO] [logging.py:107:log_dist] [Rank 0] step=3731, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3731 loss: 0.0431 iter time (s): 10.828 samples/sec: 0.092 +Started new epoch: 92 +[2025-05-06 06:17:12,250] [INFO] [logging.py:107:log_dist] [Rank 0] step=3732, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3732 loss: 0.0951 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:17:22,924] [INFO] [logging.py:107:log_dist] [Rank 0] step=3733, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3733 loss: 0.0556 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:17:33,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=3734, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3734 loss: 0.0500 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:17:44,266] [INFO] [logging.py:107:log_dist] [Rank 0] step=3735, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3735 loss: 0.0590 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:17:54,941] [INFO] [logging.py:107:log_dist] [Rank 0] step=3736, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3736 loss: 0.0283 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:18:05,612] [INFO] [logging.py:107:log_dist] [Rank 0] step=3737, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3737 loss: 0.0378 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:18:16,280] [INFO] [logging.py:107:log_dist] [Rank 0] step=3738, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3738 loss: 0.0836 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:18:26,960] [INFO] [logging.py:107:log_dist] [Rank 0] step=3739, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3739 loss: 0.1220 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 06:18:37,819] [INFO] [logging.py:107:log_dist] [Rank 0] step=3740, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3740 loss: 0.2407 iter time (s): 10.828 samples/sec: 0.092 +[2025-05-06 06:18:48,489] [INFO] [logging.py:107:log_dist] [Rank 0] step=3741, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3741 loss: 0.0433 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:18:59,163] [INFO] [logging.py:107:log_dist] [Rank 0] step=3742, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3742 loss: 0.1447 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:19:09,840] [INFO] [logging.py:107:log_dist] [Rank 0] step=3743, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3743 loss: 0.0786 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:19:20,508] [INFO] [logging.py:107:log_dist] [Rank 0] step=3744, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3744 loss: 0.3851 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:19:31,182] [INFO] [logging.py:107:log_dist] [Rank 0] step=3745, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3745 loss: 0.0795 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:19:41,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=3746, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3746 loss: 0.0527 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:19:52,518] [INFO] [logging.py:107:log_dist] [Rank 0] step=3747, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3747 loss: 0.0255 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:20:03,194] [INFO] [logging.py:107:log_dist] [Rank 0] step=3748, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3748 loss: 0.0377 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:20:14,021] [INFO] [logging.py:107:log_dist] [Rank 0] step=3749, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3749 loss: 0.0909 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-06 06:20:24,693] [INFO] [logging.py:107:log_dist] [Rank 0] step=3750, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3750 loss: 0.2347 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:20:35,367] [INFO] [logging.py:107:log_dist] [Rank 0] step=3751, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3751 loss: 0.0333 iter time (s): 10.643 samples/sec: 0.094 +[2025-05-06 06:20:46,036] [INFO] [logging.py:107:log_dist] [Rank 0] step=3752, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3752 loss: 0.0716 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:20:56,715] [INFO] [logging.py:107:log_dist] [Rank 0] step=3753, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3753 loss: 0.0487 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:21:07,390] [INFO] [logging.py:107:log_dist] [Rank 0] step=3754, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3754 loss: 0.0323 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:21:18,062] [INFO] [logging.py:107:log_dist] [Rank 0] step=3755, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3755 loss: 0.1010 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:21:28,732] [INFO] [logging.py:107:log_dist] [Rank 0] step=3756, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3756 loss: 0.2550 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:21:39,575] [INFO] [logging.py:107:log_dist] [Rank 0] step=3757, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3757 loss: 0.0468 iter time (s): 10.812 samples/sec: 0.092 +[2025-05-06 06:21:50,245] [INFO] [logging.py:107:log_dist] [Rank 0] step=3758, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3758 loss: 0.1198 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:22:00,927] [INFO] [logging.py:107:log_dist] [Rank 0] step=3759, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3759 loss: 0.0667 iter time (s): 10.652 samples/sec: 0.094 +[2025-05-06 06:22:11,600] [INFO] [logging.py:107:log_dist] [Rank 0] step=3760, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3760 loss: 0.0381 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:22:22,285] [INFO] [logging.py:107:log_dist] [Rank 0] step=3761, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3761 loss: 0.0290 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:22:32,957] [INFO] [logging.py:107:log_dist] [Rank 0] step=3762, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3762 loss: 0.0528 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:22:43,632] [INFO] [logging.py:107:log_dist] [Rank 0] step=3763, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3763 loss: 0.0902 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:22:54,301] [INFO] [logging.py:107:log_dist] [Rank 0] step=3764, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3764 loss: 0.1204 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:23:04,977] [INFO] [logging.py:107:log_dist] [Rank 0] step=3765, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3765 loss: 0.0827 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:23:15,862] [INFO] [logging.py:107:log_dist] [Rank 0] step=3766, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3766 loss: 0.0282 iter time (s): 10.851 samples/sec: 0.092 +[2025-05-06 06:23:26,539] [INFO] [logging.py:107:log_dist] [Rank 0] step=3767, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3767 loss: 0.0308 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:23:37,214] [INFO] [logging.py:107:log_dist] [Rank 0] step=3768, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3768 loss: 0.1968 iter time (s): 10.644 samples/sec: 0.094 +[2025-05-06 06:23:47,883] [INFO] [logging.py:107:log_dist] [Rank 0] step=3769, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3769 loss: 0.0369 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:23:58,551] [INFO] [logging.py:107:log_dist] [Rank 0] step=3770, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3770 loss: 0.0569 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:24:09,224] [INFO] [logging.py:107:log_dist] [Rank 0] step=3771, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3771 loss: 0.0296 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:24:19,886] [INFO] [logging.py:107:log_dist] [Rank 0] step=3772, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3772 loss: 0.0610 iter time (s): 10.636 samples/sec: 0.094 +Started new epoch: 93 +[2025-05-06 06:24:30,892] [INFO] [logging.py:107:log_dist] [Rank 0] step=3773, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3773 loss: 0.0276 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:24:41,753] [INFO] [logging.py:107:log_dist] [Rank 0] step=3774, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3774 loss: 0.1505 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 06:24:52,421] [INFO] [logging.py:107:log_dist] [Rank 0] step=3775, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3775 loss: 0.0395 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:25:03,092] [INFO] [logging.py:107:log_dist] [Rank 0] step=3776, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3776 loss: 0.0452 iter time (s): 10.640 samples/sec: 0.094 +[2025-05-06 06:25:13,768] [INFO] [logging.py:107:log_dist] [Rank 0] step=3777, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3777 loss: 0.0752 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:25:24,437] [INFO] [logging.py:107:log_dist] [Rank 0] step=3778, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3778 loss: 0.0890 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:25:35,107] [INFO] [logging.py:107:log_dist] [Rank 0] step=3779, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3779 loss: 0.1295 iter time (s): 10.639 samples/sec: 0.094 +[2025-05-06 06:25:45,787] [INFO] [logging.py:107:log_dist] [Rank 0] step=3780, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3780 loss: 0.0853 iter time (s): 10.649 samples/sec: 0.094 +[2025-05-06 06:25:56,460] [INFO] [logging.py:107:log_dist] [Rank 0] step=3781, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3781 loss: 0.0446 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:26:07,128] [INFO] [logging.py:107:log_dist] [Rank 0] step=3782, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3782 loss: 0.0353 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:26:17,960] [INFO] [logging.py:107:log_dist] [Rank 0] step=3783, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3783 loss: 0.0834 iter time (s): 10.802 samples/sec: 0.093 +[2025-05-06 06:26:28,618] [INFO] [logging.py:107:log_dist] [Rank 0] step=3784, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3784 loss: 0.0456 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 06:26:39,278] [INFO] [logging.py:107:log_dist] [Rank 0] step=3785, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3785 loss: 0.0332 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:26:49,937] [INFO] [logging.py:107:log_dist] [Rank 0] step=3786, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3786 loss: 0.0466 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 06:27:00,597] [INFO] [logging.py:107:log_dist] [Rank 0] step=3787, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3787 loss: 0.0649 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:27:11,254] [INFO] [logging.py:107:log_dist] [Rank 0] step=3788, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3788 loss: 0.2876 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 06:27:21,914] [INFO] [logging.py:107:log_dist] [Rank 0] step=3789, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3789 loss: 0.0399 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:27:32,578] [INFO] [logging.py:107:log_dist] [Rank 0] step=3790, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3790 loss: 0.2405 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 06:27:43,425] [INFO] [logging.py:107:log_dist] [Rank 0] step=3791, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3791 loss: 0.2061 iter time (s): 10.817 samples/sec: 0.092 +[2025-05-06 06:27:54,089] [INFO] [logging.py:107:log_dist] [Rank 0] step=3792, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3792 loss: 0.0404 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:28:04,749] [INFO] [logging.py:107:log_dist] [Rank 0] step=3793, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3793 loss: 0.0518 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:28:15,407] [INFO] [logging.py:107:log_dist] [Rank 0] step=3794, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3794 loss: 0.0620 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 06:28:26,073] [INFO] [logging.py:107:log_dist] [Rank 0] step=3795, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3795 loss: 0.1449 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:28:36,741] [INFO] [logging.py:107:log_dist] [Rank 0] step=3796, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3796 loss: 0.0615 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:28:47,408] [INFO] [logging.py:107:log_dist] [Rank 0] step=3797, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3797 loss: 0.1956 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 06:28:58,070] [INFO] [logging.py:107:log_dist] [Rank 0] step=3798, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3798 loss: 0.0916 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 06:29:08,735] [INFO] [logging.py:107:log_dist] [Rank 0] step=3799, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3799 loss: 0.0291 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:29:19,596] [INFO] [logging.py:107:log_dist] [Rank 0] step=3800, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3800 loss: 0.0342 iter time (s): 10.830 samples/sec: 0.092 +[2025-05-06 06:29:30,260] [INFO] [logging.py:107:log_dist] [Rank 0] step=3801, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3801 loss: 0.0529 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:29:40,919] [INFO] [logging.py:107:log_dist] [Rank 0] step=3802, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3802 loss: 0.1925 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:29:51,581] [INFO] [logging.py:107:log_dist] [Rank 0] step=3803, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3803 loss: 0.0361 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:30:02,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=3804, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3804 loss: 0.0352 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 06:30:12,909] [INFO] [logging.py:107:log_dist] [Rank 0] step=3805, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3805 loss: 0.0624 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 06:30:23,568] [INFO] [logging.py:107:log_dist] [Rank 0] step=3806, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3806 loss: 0.2387 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:30:34,245] [INFO] [logging.py:107:log_dist] [Rank 0] step=3807, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3807 loss: 0.0372 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 06:30:45,088] [INFO] [logging.py:107:log_dist] [Rank 0] step=3808, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3808 loss: 0.0611 iter time (s): 10.811 samples/sec: 0.093 +[2025-05-06 06:30:55,743] [INFO] [logging.py:107:log_dist] [Rank 0] step=3809, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3809 loss: 0.1144 iter time (s): 10.625 samples/sec: 0.094 +[2025-05-06 06:31:06,403] [INFO] [logging.py:107:log_dist] [Rank 0] step=3810, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3810 loss: 0.2520 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 06:31:17,059] [INFO] [logging.py:107:log_dist] [Rank 0] step=3811, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3811 loss: 0.0470 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 06:31:27,731] [INFO] [logging.py:107:log_dist] [Rank 0] step=3812, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3812 loss: 0.0428 iter time (s): 10.642 samples/sec: 0.094 +[2025-05-06 06:31:38,390] [INFO] [logging.py:107:log_dist] [Rank 0] step=3813, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3813 loss: 0.0414 iter time (s): 10.632 samples/sec: 0.094 +Started new epoch: 94 +[2025-05-06 06:31:49,394] [INFO] [logging.py:107:log_dist] [Rank 0] step=3814, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3814 loss: 0.0441 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:32:00,057] [INFO] [logging.py:107:log_dist] [Rank 0] step=3815, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3815 loss: 0.1172 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:32:10,719] [INFO] [logging.py:107:log_dist] [Rank 0] step=3816, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3816 loss: 0.0363 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:32:21,544] [INFO] [logging.py:107:log_dist] [Rank 0] step=3817, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3817 loss: 0.0627 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-06 06:32:32,206] [INFO] [logging.py:107:log_dist] [Rank 0] step=3818, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3818 loss: 0.0287 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 06:32:42,867] [INFO] [logging.py:107:log_dist] [Rank 0] step=3819, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3819 loss: 0.0379 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:32:53,530] [INFO] [logging.py:107:log_dist] [Rank 0] step=3820, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3820 loss: 0.1142 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:33:04,190] [INFO] [logging.py:107:log_dist] [Rank 0] step=3821, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3821 loss: 0.1941 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:33:14,850] [INFO] [logging.py:107:log_dist] [Rank 0] step=3822, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3822 loss: 0.0374 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:33:25,509] [INFO] [logging.py:107:log_dist] [Rank 0] step=3823, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3823 loss: 0.4982 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:33:36,172] [INFO] [logging.py:107:log_dist] [Rank 0] step=3824, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3824 loss: 0.1425 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:33:46,831] [INFO] [logging.py:107:log_dist] [Rank 0] step=3825, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3825 loss: 0.0380 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:33:57,652] [INFO] [logging.py:107:log_dist] [Rank 0] step=3826, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3826 loss: 0.0802 iter time (s): 10.790 samples/sec: 0.093 +[2025-05-06 06:34:08,318] [INFO] [logging.py:107:log_dist] [Rank 0] step=3827, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3827 loss: 0.2099 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 06:34:18,980] [INFO] [logging.py:107:log_dist] [Rank 0] step=3828, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3828 loss: 0.0580 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:34:29,640] [INFO] [logging.py:107:log_dist] [Rank 0] step=3829, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3829 loss: 0.0428 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:34:40,304] [INFO] [logging.py:107:log_dist] [Rank 0] step=3830, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3830 loss: 0.0288 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:34:50,973] [INFO] [logging.py:107:log_dist] [Rank 0] step=3831, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3831 loss: 0.1864 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:35:01,637] [INFO] [logging.py:107:log_dist] [Rank 0] step=3832, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3832 loss: 0.1229 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:35:12,300] [INFO] [logging.py:107:log_dist] [Rank 0] step=3833, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3833 loss: 0.0540 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:35:23,150] [INFO] [logging.py:107:log_dist] [Rank 0] step=3834, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3834 loss: 0.1023 iter time (s): 10.818 samples/sec: 0.092 +[2025-05-06 06:35:33,810] [INFO] [logging.py:107:log_dist] [Rank 0] step=3835, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3835 loss: 0.0314 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:35:44,474] [INFO] [logging.py:107:log_dist] [Rank 0] step=3836, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3836 loss: 0.1081 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:35:55,132] [INFO] [logging.py:107:log_dist] [Rank 0] step=3837, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3837 loss: 0.1484 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:36:05,794] [INFO] [logging.py:107:log_dist] [Rank 0] step=3838, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3838 loss: 0.0589 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:36:16,455] [INFO] [logging.py:107:log_dist] [Rank 0] step=3839, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3839 loss: 0.0341 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:36:27,118] [INFO] [logging.py:107:log_dist] [Rank 0] step=3840, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3840 loss: 0.0862 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:36:37,779] [INFO] [logging.py:107:log_dist] [Rank 0] step=3841, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3841 loss: 0.0669 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:36:48,447] [INFO] [logging.py:107:log_dist] [Rank 0] step=3842, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3842 loss: 0.0280 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:36:59,266] [INFO] [logging.py:107:log_dist] [Rank 0] step=3843, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3843 loss: 0.1031 iter time (s): 10.787 samples/sec: 0.093 +[2025-05-06 06:37:09,924] [INFO] [logging.py:107:log_dist] [Rank 0] step=3844, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3844 loss: 0.0366 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:37:20,590] [INFO] [logging.py:107:log_dist] [Rank 0] step=3845, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3845 loss: 0.0520 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 06:37:31,250] [INFO] [logging.py:107:log_dist] [Rank 0] step=3846, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3846 loss: 0.0320 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:37:41,912] [INFO] [logging.py:107:log_dist] [Rank 0] step=3847, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3847 loss: 0.0327 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 06:37:52,578] [INFO] [logging.py:107:log_dist] [Rank 0] step=3848, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3848 loss: 0.1997 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:38:03,236] [INFO] [logging.py:107:log_dist] [Rank 0] step=3849, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3849 loss: 0.1338 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 06:38:13,896] [INFO] [logging.py:107:log_dist] [Rank 0] step=3850, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3850 loss: 0.0892 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:38:24,559] [INFO] [logging.py:107:log_dist] [Rank 0] step=3851, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3851 loss: 0.2018 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:38:35,402] [INFO] [logging.py:107:log_dist] [Rank 0] step=3852, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3852 loss: 0.0831 iter time (s): 10.813 samples/sec: 0.092 +[2025-05-06 06:38:46,061] [INFO] [logging.py:107:log_dist] [Rank 0] step=3853, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3853 loss: 0.0933 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:38:56,720] [INFO] [logging.py:107:log_dist] [Rank 0] step=3854, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3854 loss: 0.0437 iter time (s): 10.633 samples/sec: 0.094 +Started new epoch: 95 +[2025-05-06 06:39:07,720] [INFO] [logging.py:107:log_dist] [Rank 0] step=3855, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3855 loss: 0.0745 iter time (s): 10.631 samples/sec: 0.094 +[2025-05-06 06:39:18,386] [INFO] [logging.py:107:log_dist] [Rank 0] step=3856, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3856 loss: 0.0342 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 06:39:29,052] [INFO] [logging.py:107:log_dist] [Rank 0] step=3857, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3857 loss: 0.0399 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:39:39,719] [INFO] [logging.py:107:log_dist] [Rank 0] step=3858, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3858 loss: 0.0745 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:39:50,378] [INFO] [logging.py:107:log_dist] [Rank 0] step=3859, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3859 loss: 0.1148 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:40:01,206] [INFO] [logging.py:107:log_dist] [Rank 0] step=3860, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3860 loss: 0.0429 iter time (s): 10.797 samples/sec: 0.093 +[2025-05-06 06:40:11,874] [INFO] [logging.py:107:log_dist] [Rank 0] step=3861, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3861 loss: 0.0680 iter time (s): 10.637 samples/sec: 0.094 +[2025-05-06 06:40:22,533] [INFO] [logging.py:107:log_dist] [Rank 0] step=3862, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3862 loss: 0.0277 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:40:33,199] [INFO] [logging.py:107:log_dist] [Rank 0] step=3863, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3863 loss: 0.2063 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:40:43,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=3864, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3864 loss: 0.1694 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:40:54,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=3865, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3865 loss: 0.1308 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:41:05,178] [INFO] [logging.py:107:log_dist] [Rank 0] step=3866, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3866 loss: 0.0371 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:41:15,836] [INFO] [logging.py:107:log_dist] [Rank 0] step=3867, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3867 loss: 0.1704 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 06:41:26,512] [INFO] [logging.py:107:log_dist] [Rank 0] step=3868, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3868 loss: 0.0542 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:41:37,338] [INFO] [logging.py:107:log_dist] [Rank 0] step=3869, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3869 loss: 0.0305 iter time (s): 10.795 samples/sec: 0.093 +[2025-05-06 06:41:47,995] [INFO] [logging.py:107:log_dist] [Rank 0] step=3870, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3870 loss: 0.0658 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 06:41:58,672] [INFO] [logging.py:107:log_dist] [Rank 0] step=3871, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3871 loss: 0.1371 iter time (s): 10.645 samples/sec: 0.094 +[2025-05-06 06:42:09,334] [INFO] [logging.py:107:log_dist] [Rank 0] step=3872, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3872 loss: 0.0295 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:42:19,992] [INFO] [logging.py:107:log_dist] [Rank 0] step=3873, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3873 loss: 0.1256 iter time (s): 10.627 samples/sec: 0.094 +[2025-05-06 06:42:30,656] [INFO] [logging.py:107:log_dist] [Rank 0] step=3874, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3874 loss: 0.0613 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:42:41,323] [INFO] [logging.py:107:log_dist] [Rank 0] step=3875, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3875 loss: 0.0349 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 06:42:51,983] [INFO] [logging.py:107:log_dist] [Rank 0] step=3876, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3876 loss: 0.0702 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:43:02,860] [INFO] [logging.py:107:log_dist] [Rank 0] step=3877, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3877 loss: 0.0637 iter time (s): 10.847 samples/sec: 0.092 +[2025-05-06 06:43:13,520] [INFO] [logging.py:107:log_dist] [Rank 0] step=3878, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3878 loss: 0.0409 iter time (s): 10.629 samples/sec: 0.094 +[2025-05-06 06:43:24,186] [INFO] [logging.py:107:log_dist] [Rank 0] step=3879, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3879 loss: 0.0778 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 06:43:34,886] [INFO] [logging.py:107:log_dist] [Rank 0] step=3880, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3880 loss: 0.0823 iter time (s): 10.668 samples/sec: 0.094 +[2025-05-06 06:43:45,587] [INFO] [logging.py:107:log_dist] [Rank 0] step=3881, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3881 loss: 0.0692 iter time (s): 10.670 samples/sec: 0.094 +[2025-05-06 06:43:56,281] [INFO] [logging.py:107:log_dist] [Rank 0] step=3882, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3882 loss: 0.0494 iter time (s): 10.662 samples/sec: 0.094 +[2025-05-06 06:44:06,981] [INFO] [logging.py:107:log_dist] [Rank 0] step=3883, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3883 loss: 0.0587 iter time (s): 10.669 samples/sec: 0.094 +[2025-05-06 06:44:17,676] [INFO] [logging.py:107:log_dist] [Rank 0] step=3884, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3884 loss: 0.0444 iter time (s): 10.663 samples/sec: 0.094 +[2025-05-06 06:44:28,393] [INFO] [logging.py:107:log_dist] [Rank 0] step=3885, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3885 loss: 0.0254 iter time (s): 10.685 samples/sec: 0.094 +[2025-05-06 06:44:39,247] [INFO] [logging.py:107:log_dist] [Rank 0] step=3886, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3886 loss: 0.0663 iter time (s): 10.822 samples/sec: 0.092 +[2025-05-06 06:44:49,908] [INFO] [logging.py:107:log_dist] [Rank 0] step=3887, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3887 loss: 0.1025 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:45:00,572] [INFO] [logging.py:107:log_dist] [Rank 0] step=3888, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3888 loss: 0.0580 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:45:11,241] [INFO] [logging.py:107:log_dist] [Rank 0] step=3889, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3889 loss: 0.2410 iter time (s): 10.638 samples/sec: 0.094 +[2025-05-06 06:45:21,908] [INFO] [logging.py:107:log_dist] [Rank 0] step=3890, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3890 loss: 0.0542 iter time (s): 10.635 samples/sec: 0.094 +[2025-05-06 06:45:32,565] [INFO] [logging.py:107:log_dist] [Rank 0] step=3891, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3891 loss: 0.1207 iter time (s): 10.626 samples/sec: 0.094 +[2025-05-06 06:45:43,229] [INFO] [logging.py:107:log_dist] [Rank 0] step=3892, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3892 loss: 0.0341 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:45:53,893] [INFO] [logging.py:107:log_dist] [Rank 0] step=3893, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3893 loss: 0.0848 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:46:04,553] [INFO] [logging.py:107:log_dist] [Rank 0] step=3894, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3894 loss: 0.0333 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:46:15,377] [INFO] [logging.py:107:log_dist] [Rank 0] step=3895, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3895 loss: 0.0664 iter time (s): 10.797 samples/sec: 0.093 +Started new epoch: 96 +[2025-05-06 06:46:26,376] [INFO] [logging.py:107:log_dist] [Rank 0] step=3896, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3896 loss: 0.0482 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:46:37,040] [INFO] [logging.py:107:log_dist] [Rank 0] step=3897, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3897 loss: 0.0517 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:46:47,705] [INFO] [logging.py:107:log_dist] [Rank 0] step=3898, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3898 loss: 0.0354 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:46:58,369] [INFO] [logging.py:107:log_dist] [Rank 0] step=3899, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3899 loss: 0.0428 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:47:09,029] [INFO] [logging.py:107:log_dist] [Rank 0] step=3900, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3900 loss: 0.1160 iter time (s): 10.628 samples/sec: 0.094 +[2025-05-06 06:47:19,701] [INFO] [logging.py:107:log_dist] [Rank 0] step=3901, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3901 loss: 0.0305 iter time (s): 10.641 samples/sec: 0.094 +[2025-05-06 06:47:30,364] [INFO] [logging.py:107:log_dist] [Rank 0] step=3902, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3902 loss: 0.0702 iter time (s): 10.630 samples/sec: 0.094 +[2025-05-06 06:47:41,028] [INFO] [logging.py:107:log_dist] [Rank 0] step=3903, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3903 loss: 0.0558 iter time (s): 10.632 samples/sec: 0.094 +[2025-05-06 06:47:51,865] [INFO] [logging.py:107:log_dist] [Rank 0] step=3904, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3904 loss: 0.2702 iter time (s): 10.800 samples/sec: 0.093 +[2025-05-06 06:48:02,529] [INFO] [logging.py:107:log_dist] [Rank 0] step=3905, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3905 loss: 0.1152 iter time (s): 10.633 samples/sec: 0.094 +[2025-05-06 06:48:13,195] [INFO] [logging.py:107:log_dist] [Rank 0] step=3906, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3906 loss: 0.0960 iter time (s): 10.634 samples/sec: 0.094 +[2025-05-06 06:48:23,862] [INFO] [logging.py:107:log_dist] [Rank 0] step=3907, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3907 loss: 0.0591 iter time (s): 10.636 samples/sec: 0.094 +[2025-05-06 06:48:34,553] [INFO] [logging.py:107:log_dist] [Rank 0] step=3908, skipped=0, lr=[1e-05], mom=[0.0] +steps: 3908 loss: 0.0342 iter time (s): 10.660 samples/sec: 0.094 diff --git a/wandb/run-20250505_191035-lg5j0rns/files/requirements.txt b/wandb/run-20250505_191035-lg5j0rns/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..70c0507a8756345c8745b922901bde6889096f0b --- /dev/null +++ b/wandb/run-20250505_191035-lg5j0rns/files/requirements.txt @@ -0,0 +1,222 @@ +PyYAML==6.0.2 +asttokens==3.0.0 +certifi==2025.1.31 +charset-normalizer==3.4.1 +comm==0.2.2 +debugpy==1.8.13 +decorator==5.2.1 +executing==2.2.0 +filelock==3.17.0 +fsspec==2025.3.0 +huggingface-hub==0.29.3 +idna==3.10 +inquirerpy==0.3.4 +ipykernel==6.29.5 +ipython==9.0.2 +ipython_pygments_lexers==1.1.1 +ipywidgets==8.1.5 +jedi==0.19.2 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyterlab_widgets==3.0.13 +matplotlib-inline==0.1.7 +nest-asyncio==1.6.0 +packaging==24.2 +parso==0.8.4 +pexpect==4.9.0 +pfzy==0.3.4 +platformdirs==4.3.6 +prompt_toolkit==3.0.50 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +Pygments==2.19.1 +python-dateutil==2.9.0.post0 +pyzmq==26.2.1 +requests==2.32.3 +six==1.17.0 +stack-data==0.6.3 +tornado==6.4.2 +tqdm==4.67.1 +traitlets==5.14.3 +typing_extensions==4.12.2 +wcwidth==0.2.13 +wheel==0.45.1 +widgetsnbextension==4.0.13 +setuptools==80.3.1 +pip==25.1.1 +text-unidecode==1.3 +websocket-client==1.8.0 +types-python-dateutil==2.9.0.20241206 +tomlkit==0.13.2 +semver==3.0.4 +ruff==0.11.8 +python-slugify==8.0.4 +pathspec==0.12.1 +binaryornot==0.4.4 +questionary==2.1.0 +mixpanel==4.10.1 +arrow==1.3.0 +cookiecutter==2.6.0 +comfy-cli==1.3.8 +zipp==3.21.0 +protobuf==6.30.2 +ftfy==6.3.1 +importlib_metadata==8.7.0 +diffusers==0.33.1 +accelerate==1.6.0 +ninja==1.11.1.4 +flash_attn==2.7.4.post1 +pytz==2025.2 +py-cpuinfo==9.0.0 +nvidia-ml-py==12.570.86 +hjson==3.1.0 +antlr4-python3-runtime==4.9.3 +xxhash==3.5.0 +Werkzeug==3.1.3 +tzdata==2025.2 +termcolor==3.1.0 +tensorboard-data-server==0.7.2 +setproctitle==1.3.6 +sentry-sdk==2.27.0 +pyarrow==20.0.0 +portalocker==3.1.1 +omegaconf==2.3.0 +msgpack==1.1.0 +Markdown==3.8 +loguru==0.7.3 +imageio-ffmpeg==0.6.0 +grpcio==1.71.0 +docker-pycreds==0.4.0 +dill==0.3.8 +absl-py==2.2.2 +tensorboard==2.19.0 +pandas==2.2.3 +multiprocess==0.70.16 +iopath==0.1.10 +hydra-core==1.3.2 +wandb==0.19.10 +torch-optimi==0.2.1 +pytorch_optimizer==3.5.1 +deepspeed==0.16.7 +datasets==3.5.1 +bitsandbytes==0.45.5 +peft==0.15.2 +ddt==1.7.2 +pyparsing==3.2.3 +mss==10.0.0 +kiwisolver==1.4.8 +fonttools==4.57.0 +docutils==0.21.2 +cycler==0.12.1 +contourpy==1.3.2 +matplotlib==3.10.1 +color-matcher==0.6.0 +Deprecated==1.2.18 +GitPython==3.1.44 +PyGithub==2.6.1 +PyJWT==2.10.1 +PyMatting==1.1.13 +PyNaCl==1.5.0 +PySocks==1.7.1 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +albucore==0.0.16 +albumentations==1.4.15 +annotated-types==0.7.0 +anyio==4.9.0 +attrs==25.3.0 +av==14.2.0 +beautifulsoup4==4.13.3 +cffi==1.17.1 +chardet==5.2.0 +click==8.1.8 +colour-science==0.4.6 +comfyui_frontend_package==1.11.8 +cryptography==44.0.2 +easydict==1.13 +einops==0.8.1 +eval_type_backport==0.2.2 +flet==0.27.6 +frozenlist==1.5.0 +gdown==5.2.0 +gitdb==4.0.12 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +imageio==2.37.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kornia==0.8.0 +kornia_rs==0.1.8 +lazy_loader==0.4 +llvmlite==0.44.0 +markdown-it-py==3.0.0 +matrix-client==0.4.0 +mdurl==0.1.2 +multidict==6.2.0 +numba==0.61.0 +numpy==1.26.4 +oauthlib==3.2.2 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +pixeloe==0.1.4 +pooch==1.8.2 +propcache==0.3.0 +pycparser==2.22 +pydantic==2.10.6 +pydantic_core==2.27.2 +referencing==0.36.2 +regex==2024.11.6 +rembg==2.0.65 +repath==0.9.0 +rich==13.9.4 +rpds-py==0.23.1 +safetensors==0.5.3 +scikit-image==0.25.2 +scipy==1.15.2 +sentencepiece==0.2.0 +shellingham==1.5.4 +smmap==5.0.2 +sniffio==1.3.1 +soundfile==0.13.1 +soupsieve==2.6 +spandrel==0.4.1 +tifffile==2025.3.13 +timm==1.0.15 +tokenizers==0.21.1 +toml==0.10.2 +torchsde==0.2.6 +trampoline==0.1.2 +transformers==4.49.0 +transparent-background==1.3.3 +typer==0.15.2 +urllib3==1.26.20 +uv==0.6.7 +wget==3.2 +wrapt==1.17.2 +xformers==0.0.29.post1 +yarl==1.18.3 +MarkupSafe==2.1.5 +Jinja2==3.1.4 +mpmath==1.3.0 +networkx==3.3 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.1.105 +nvidia-nvtx-cu12==12.1.105 +pillow==11.0.0 +sympy==1.13.1 +torch==2.5.1+cu121 +torchaudio==2.5.1+cu121 +torchvision==0.20.1+cu121 +triton==3.1.0 diff --git a/wandb/run-20250505_191035-lg5j0rns/files/wandb-metadata.json b/wandb/run-20250505_191035-lg5j0rns/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..49077fe295ccccf0e7aa347d9b6c986ae2f0cbe7 --- /dev/null +++ b/wandb/run-20250505_191035-lg5j0rns/files/wandb-metadata.json @@ -0,0 +1,48 @@ +{ + "os": "Linux-5.4.0-205-generic-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-05-05T19:10:35.749212Z", + "args": [ + "--local_rank=0", + "--deepspeed", + "--config", + "/workspace/configs/wan.toml" + ], + "program": "/workspace/diffusion-pipe/train.py", + "codePath": "train.py", + "git": { + "remote": "https://github.com/tdrussell/diffusion-pipe", + "commit": "fdf265eb9920c3675d89902550867dcc3e678616" + }, + "email": "santtu.keskinen@gmail.com", + "root": "/workspace/ComfyUI/models/loras/out/20250505_19-10-35", + "host": "420c94ca0326", + "executable": "/venv/main/bin/python3.11", + "codePathLocal": "train.py", + "cpu_count": 6, + "cpu_count_logical": 12, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 1, + "disk": { + "/": { + "total": "136365211648", + "used": "67060051968" + } + }, + "memory": { + "total": "67260588032" + }, + "cpu": { + "count": 6, + "countLogical": 12 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + } + ], + "cudaVersion": "12.3" +} \ No newline at end of file diff --git a/wandb/run-20250505_191035-lg5j0rns/logs/debug-core.log b/wandb/run-20250505_191035-lg5j0rns/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..3c4401342c182fa0afe110be15a0bd87b43bd2f1 --- /dev/null +++ b/wandb/run-20250505_191035-lg5j0rns/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-05-05T19:10:35.409569814Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpqq0k6baw/port-20093.txt","pid":20093,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-05-05T19:10:35.415207733Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":20093} +{"time":"2025-05-05T19:10:35.415202103Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":38199,"Zone":""}} +{"time":"2025-05-05T19:10:35.58931366Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:32994"} +{"time":"2025-05-05T19:10:35.752772263Z","level":"INFO","msg":"handleInformInit: received","streamId":"lg5j0rns","id":"127.0.0.1:32994"} +{"time":"2025-05-05T19:10:35.9661651Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"lg5j0rns","id":"127.0.0.1:32994"} +{"time":"2025-05-06T06:48:44.786763998Z","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/wandb/run-20250505_191035-lg5j0rns/logs/debug-internal.log b/wandb/run-20250505_191035-lg5j0rns/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..203156c8edb0f901705f25fa9530914c25d8348a --- /dev/null +++ b/wandb/run-20250505_191035-lg5j0rns/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-05-05T19:10:35.753351658Z","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/workspace/ComfyUI/models/loras/out/20250505_19-10-35/wandb/run-20250505_191035-lg5j0rns/logs/debug-core.log"} +{"time":"2025-05-05T19:10:35.966072548Z","level":"INFO","msg":"created new stream","id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966148127Z","level":"INFO","msg":"stream: started","id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966212213Z","level":"INFO","msg":"writer: Do: started","stream_id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966242016Z","level":"INFO","msg":"handler: started","stream_id":"lg5j0rns"} +{"time":"2025-05-05T19:10:35.966521812Z","level":"INFO","msg":"sender: started","stream_id":"lg5j0rns"} +{"time":"2025-05-05T19:10:36.117361836Z","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20250505_191035-lg5j0rns/logs/debug.log b/wandb/run-20250505_191035-lg5j0rns/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2f39708c0bfaede2fc1a8259b7811f14ff9447b3 --- /dev/null +++ b/wandb/run-20250505_191035-lg5j0rns/logs/debug.log @@ -0,0 +1,22 @@ +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10 +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Configure stats pid to 20093 +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Loading settings from /root/.config/wandb/settings +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Loading settings from /workspace/diffusion-pipe/wandb/settings +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_setup.py:_flush():68] Loading settings from environment variables +2025-05-05 19:10:35,744 INFO MainThread:20093 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /workspace/ComfyUI/models/loras/out/20250505_19-10-35/wandb/run-20250505_191035-lg5j0rns/logs/debug.log +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /workspace/ComfyUI/models/loras/out/20250505_19-10-35/wandb/run-20250505_191035-lg5j0rns/logs/debug-internal.log +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():852] calling init triggers +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():857] wandb.init called with sweep_config: {} +config: {'output_dir': '/workspace/ComfyUI/models/loras/out', 'dataset': '/workspace/configs/dataset_wan.toml', 'epochs': 1000, 'micro_batch_size_per_gpu': 1, 'pipeline_stages': 1, 'gradient_accumulation_steps': 1, 'gradient_clipping': 1.0, 'warmup_steps': 40, 'activation_checkpointing': True, 'partition_method': 'parameters', 'save_dtype': torch.bfloat16, 'caching_batch_size': 1, 'steps_per_print': 1, 'video_clip_mode': 'single_beginning', 'save_every_n_epochs': 10, 'checkpoint_every_n_minutes': 120, 'blocks_to_swap': 20, 'eval_every_n_epochs': 1, 'eval_before_first_step': True, 'eval_micro_batch_size_per_gpu': 1, 'eval_gradient_accumulation_steps': 1, 'model': {'type': 'wan', 'ckpt_path': '/workspace/Wan2.1', 'transformer_path': '/workspace/ComfyUI/models/diffusion_models/wan2.1_i2v_480p_14B_bf16.safetensors', 'llm_path': '/workspace/ComfyUI/models/text_encoders/umt5-xxl-enc-bf16.safetensors', 'dtype': torch.bfloat16, 'timestep_sample_method': 'logit_normal', 'guidance': 1.0}, 'adapter': {'type': 'lora', 'rank': 32, 'dtype': torch.bfloat16, 'alpha': 32, 'dropout': 0.0}, 'optimizer': {'type': 'adamw_optimi', 'lr': 1e-05, 'betas': [0.9, 0.99], 'weight_decay': 0.01}, 'monitoring': {'enable_wandb': True, 'wandb_api_key': 'f46df1bb828b735bd22f94fff1be190ba5e046f9', 'wandb_tracker_name': 'wan-lora', 'wandb_run_name': 'wan-lora'}, 'reentrant_activation_checkpointing': False, 'logging_steps': 1, 'eval_datasets': [], 'eval_every_n_steps': None, '_wandb': {}} +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():893] starting backend +2025-05-05 19:10:35,745 INFO MainThread:20093 [wandb_init.py:init():897] sending inform_init request +2025-05-05 19:10:35,748 INFO MainThread:20093 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-05-05 19:10:35,749 INFO MainThread:20093 [wandb_init.py:init():907] backend started and connected +2025-05-05 19:10:35,751 INFO MainThread:20093 [wandb_init.py:init():1002] updated telemetry +2025-05-05 19:10:35,759 INFO MainThread:20093 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout +2025-05-05 19:10:36,112 INFO MainThread:20093 [wandb_init.py:init():1101] starting run threads in backend +2025-05-05 19:10:36,318 INFO MainThread:20093 [wandb_run.py:_console_start():2566] atexit reg +2025-05-05 19:10:36,319 INFO MainThread:20093 [wandb_run.py:_redirect():2414] redirect: wrap_raw +2025-05-05 19:10:36,319 INFO MainThread:20093 [wandb_run.py:_redirect():2483] Wrapping output streams. +2025-05-05 19:10:36,320 INFO MainThread:20093 [wandb_run.py:_redirect():2506] Redirects installed. +2025-05-05 19:10:36,324 INFO MainThread:20093 [wandb_init.py:init():1147] run started, returning control to user process diff --git a/wandb/run-20250505_191035-lg5j0rns/run-lg5j0rns.wandb b/wandb/run-20250505_191035-lg5j0rns/run-lg5j0rns.wandb new file mode 100644 index 0000000000000000000000000000000000000000..7ed6e6d1822f97e386bc6e0b0be9b36709cac581 --- /dev/null +++ b/wandb/run-20250505_191035-lg5j0rns/run-lg5j0rns.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0914c09bb26d83af3e376f141e1bc9ca7927073c6679635544dfbb6e57b8d10 +size 5079040