tangledgroup/tangled-alpha-0.12-core

@@ -49,14 +49,34 @@ tags:
 ![logo](./misc/logo.jpg)
 ```bash
-time python -B prepare_core_datasets.py
 ```
 ```
 ```
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_0.yaml
 ```
 ```
@@ -65,54 +85,54 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
 Back up `wandb`:
 ```bash
-mv wandb wandb-pretrain-core-0
 ```
 Copy config:
 ```bash
-cp ../config-0.json ../out/pretrain-core-0/final/config.json
 ```
 Chat with model:
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-core-0/final
 ```
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-core-0/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-core-0/final'
 ```
 ```
 ```
 ```bash
-litgpt convert_pretrained_checkpoint ../out/pretrain-core-0/final ../out/pretrain-core-0/checkpoint
 ```
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_1.yaml
 ```
 ```bash
-litgpt convert_pretrained_checkpoint ../out/pretrain-core-1/final ../out/pretrain-core-1/checkpoint
 ```
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_2.yaml
 ```
 ```bash
-litgpt convert_pretrained_checkpoint ../out/pretrain-core-2/final ../out/pretrain-core-2/checkpoint
 ```
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_core_model_3.yaml
 ```
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-core-3/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-core-3/final'
 ```
 ```

 ![logo](./misc/logo.jpg)
 ```bash
+time python -B prepare_base_datasets.py
 ```
 ```
+i=0, min_len=0, max_len=1073741824, block_size=8193, chunk_size=16386000, len(dataset)=1496631, len(dataset) * block_size=12261897783
+Total number of tokens in the optimized dataset '../base-data-0-0-1073741824-8193-2000' is 12261897783
+i=1, min_len=8193, max_len=16385, block_size=16385, chunk_size=16385000, len(dataset)=78802, len(dataset) * block_size=1291170770
+Total number of tokens in the optimized dataset '../base-data-1-8193-16385-16385-1000' is 1291170770
+i=2, min_len=16385, max_len=32769, block_size=32769, chunk_size=16384500, len(dataset)=23511, len(dataset) * block_size=770431959
+Total number of tokens in the optimized dataset '../base-data-2-16385-32769-32769-500' is 770431959
+i=3, min_len=32769, max_len=65537, block_size=65537, chunk_size=16384250, len(dataset)=5128, len(dataset) * block_size=336073736
+Total number of tokens in the optimized dataset '../base-data-3-32769-65537-65537-250' is 336073736
+i=4, min_len=65537, max_len=131073, block_size=131073, chunk_size=16384125, len(dataset)=1169, len(dataset) * block_size=153224337
+Total number of tokens in the optimized dataset '../base-data-4-65537-131073-131073-125' is 153224337
+46G     ../base-data-0-0-1073741824-8193-2000
+4.9G    ../base-data-1-8193-16385-16385-1000
+2.9G    ../base-data-2-16385-32769-32769-500
+1.3G    ../base-data-3-32769-65537-65537-250
+589M    ../base-data-4-65537-131073-131073-125
 ```
 ```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_0.yaml
 ```
 ```
 Back up `wandb`:
 ```bash
+mv wandb wandb-pretrain-base-0
 ```
 Copy config:
 ```bash
+cp ../config-0.json ../out/pretrain-base-0/final/config.json
 ```
 Chat with model:
 ```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt chat ../out/pretrain-base-0/final
 ```
 ```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-base-0/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-base-0/final'
 ```
 ```
 ```
 ```bash
+litgpt convert_pretrained_checkpoint ../out/pretrain-base-0/final ../out/pretrain-base-0/checkpoint
 ```
 ```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_1.yaml
 ```
 ```bash
+litgpt convert_pretrained_checkpoint ../out/pretrain-base-1/final ../out/pretrain-base-1/checkpoint
 ```
 ```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_2.yaml
 ```
 ```bash
+litgpt convert_pretrained_checkpoint ../out/pretrain-base-2/final ../out/pretrain-base-2/checkpoint
 ```
 ```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True litgpt pretrain --config pretrain_base_model_3.yaml
 ```
 ```bash
+CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True time litgpt evaluate --tasks 'leaderboard' --out_dir '../evaluate/pretrain-base-3/leaderboard/' --batch_size '4' --dtype 'bfloat16' '../out/pretrain-base-3/final'
 ```
 ```

scripts/pretrain_base_model_0.yaml CHANGED

@@ -58,19 +58,19 @@ train:
   log_interval: 1
   # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
-  global_batch_size: 512
   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size: 1
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
-  lr_warmup_steps: 2000
   # Number of epochs to train on (type: Optional[int], default: null)
   epochs:
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
-  max_tokens: 32706456
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:

   log_interval: 1
   # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
+  global_batch_size: 64
   # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 4
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
+  lr_warmup_steps: 100
   # Number of epochs to train on (type: Optional[int], default: null)
   epochs:
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
+  max_tokens: 12261897783
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps: