|
NEW RUN 2023-12-10-16-53-12 |
|
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k-LoRA-rwkv-12.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA//lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 2000, 'epoch_count': 200, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 128, 'lora_alpha': 128.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-10-16-53-12', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'} |
|
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} |
|
NEW RUN 2023-12-10-16-53-12 |
|
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k-LoRA-rwkv-12.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA//lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 2000, 'epoch_count': 200, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 128, 'lora_alpha': 128.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-10-16-53-12', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'} |
|
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} |
|
NEW RUN 2023-12-10-16-53-12 |
|
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k-LoRA-rwkv-12.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA//lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 2000, 'epoch_count': 200, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 128, 'lora_alpha': 128.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-10-16-53-12', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'} |
|
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} |
|
NEW RUN 2023-12-10-16-53-12 |
|
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k-LoRA-rwkv-12.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA//lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 2000, 'epoch_count': 200, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 128, 'lora_alpha': 128.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-10-16-53-12', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'} |
|
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} |
|
0 2.196787 8.9961 0.00004990 2023-12-10 17:09:27.011339 0 |
|
1 2.177770 8.8266 0.00004980 2023-12-10 17:23:17.475495 1 |
|
2 2.181141 8.8564 0.00004970 2023-12-10 17:37:08.332684 2 |
|
3 2.166461 8.7273 0.00004960 2023-12-10 17:50:58.941680 3 |
|
4 2.170504 8.7627 0.00004950 2023-12-10 18:04:49.591724 4 |
|
5 2.164547 8.7107 0.00004940 2023-12-10 18:18:40.549475 5 |
|
6 2.162645 8.6941 0.00004930 2023-12-10 18:32:31.672345 6 |
|
7 2.163926 8.7052 0.00004920 2023-12-10 18:46:22.697187 7 |
|
8 2.161668 8.6856 0.00004910 2023-12-10 19:00:13.362314 8 |
|
9 2.160648 8.6768 0.00004900 2023-12-10 19:14:04.227370 9 |
|
10 2.158289 8.6563 0.00004891 2023-12-10 19:27:55.412297 10 |
|
11 2.159008 8.6625 0.00004881 2023-12-10 19:41:46.268648 11 |
|
12 2.157488 8.6494 0.00004871 2023-12-10 19:55:37.045079 12 |
|
13 2.154871 8.6268 0.00004861 2023-12-10 20:09:28.171833 13 |
|
14 2.149664 8.5820 0.00004851 2023-12-10 20:23:18.914219 14 |
|
15 2.142840 8.5236 0.00004842 2023-12-10 20:37:09.509170 15 |
|
16 2.150937 8.5929 0.00004832 2023-12-10 20:51:00.255558 16 |
|
17 2.141043 8.5083 0.00004822 2023-12-10 21:04:51.281992 17 |
|
18 2.141375 8.5111 0.00004812 2023-12-10 21:18:42.049462 18 |
|
19 2.142992 8.5249 0.00004803 2023-12-10 21:32:32.779415 19 |
|
20 2.142758 8.5229 0.00004793 2023-12-10 21:46:23.581201 20 |
|
21 2.139707 8.4969 0.00004784 2023-12-10 22:00:14.843404 21 |
|
22 2.134492 8.4528 0.00004774 2023-12-10 22:14:05.605972 22 |
|
23 2.140691 8.5053 0.00004764 2023-12-10 22:27:56.736756 23 |
|
|