mtasic85 committed
Commit e9d9da3 · 1 Parent(s): c13c3d0

new config; new base datasets
README.md CHANGED
@@ -96,6 +96,22 @@ CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0 PYTORCH_CUDA_ALLOC_CONF=expandable
 ```
 
 ```
+Seed set to 23
+Time to instantiate model: 0.21 seconds.
+Total parameters: 302,023,168
+Verifying settings ...
+Measured TFLOPs: 55520.94
+Epoch 1 | iter 64 step 1 | loss train: 11.982, val: n/a | iter time: 409.55 ms (step) remaining time: 4 days, 17:45:21
+Epoch 1 | iter 128 step 2 | loss train: 11.980, val: n/a | iter time: 354.46 ms (step) remaining time: 3 days, 15:01:16
+Epoch 1 | iter 192 step 3 | loss train: 11.980, val: n/a | iter time: 353.67 ms (step) remaining time: 3 days, 5:46:03
+Epoch 1 | iter 256 step 4 | loss train: 11.980, val: n/a | iter time: 354.11 ms (step) remaining time: 3 days, 1:05:26
+Epoch 1 | iter 320 step 5 | loss train: 11.978, val: n/a | iter time: 358.28 ms (step) remaining time: 2 days, 22:21:45
+Epoch 1 | iter 384 step 6 | loss train: 11.974, val: n/a | iter time: 356.21 ms (step) remaining time: 2 days, 20:33:55
+Epoch 1 | iter 448 step 7 | loss train: 11.964, val: n/a | iter time: 357.42 ms (step) remaining time: 2 days, 19:15:59
+Epoch 1 | iter 512 step 8 | loss train: 11.956, val: n/a | iter time: 355.74 ms (step) remaining time: 2 days, 18:16:43
+Epoch 1 | iter 576 step 9 | loss train: 11.937, val: n/a | iter time: 356.05 ms (step) remaining time: 2 days, 17:28:34
+Epoch 1 | iter 640 step 10 | loss train: 11.929, val: n/a | iter time: 356.68 ms (step) remaining time: 2 days, 16:49:58
+# ...
 ```
 
 Backup `wandb`:
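For reference, the `Total parameters: 302,023,168` figure in the log can be reproduced from the model geometry. A minimal sketch, assuming untied input/output embeddings, a LLaMA-style gated MLP (gate/up/down projections), RMSNorm weight vectors, no biases, and the pre-change attention geometry (`n_head: 8`, `head_size: 128`), under which the arithmetic works out exactly:

```python
# Back-of-the-envelope check of "Total parameters: 302,023,168".
# All structural assumptions are noted above; none are stated in the commit itself.
vocab_size, n_embd, n_layer = 131072, 512, 32
n_head, head_size, n_query_groups, intermediate_size = 8, 128, 8, 2048

attn = (
    n_embd * n_head * head_size                # Q projection
    + 2 * n_embd * n_query_groups * head_size  # K and V projections
    + n_head * head_size * n_embd              # output projection
)
mlp = 3 * n_embd * intermediate_size           # gate, up, down projections
block = attn + mlp + 2 * n_embd                # plus two RMSNorm weight vectors per block
total = n_layer * block + 2 * vocab_size * n_embd + n_embd  # blocks + embeddings + lm_head + final norm
print(f'{total:,}')  # 302,023,168
```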
config-0.json CHANGED
@@ -6,7 +6,7 @@
   "attention_dropout": 0.0,
   "bos_token_id": 0,
   "eos_token_id": 1,
-  "head_dim": 128,
+  "head_dim": 256,
   "hidden_act": "silu",
   "hidden_size": 512,
   "initializer_range": 0.02,
@@ -14,7 +14,7 @@
   "max_position_embeddings": 131072,
   "mlp_bias": false,
   "model_type": "llama",
-  "num_attention_heads": 8,
+  "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
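A quick sanity check of the revised attention geometry; a minimal sketch assuming a local copy of this `config-0.json` and a `transformers` version recent enough to honor an explicit `head_dim`:

```python
from transformers import LlamaConfig

cfg = LlamaConfig.from_json_file('config-0.json')

# 32 query heads now share 8 KV heads: grouped-query attention, 4 queries per KV head.
assert cfg.num_attention_heads % cfg.num_key_value_heads == 0
print(cfg.num_attention_heads // cfg.num_key_value_heads)  # 4

# With an explicit head_dim, attention width is decoupled from hidden_size:
# 32 heads * 256 dims = 8192, even though hidden_size stays 512.
print(cfg.num_attention_heads * cfg.head_dim)  # 8192
```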
scripts/core_base_datasets.py CHANGED
@@ -1,4 +1,18 @@
 core_base_datasets = [
+    #
+    # general
+    #
+    # 3.35 GB, 1,000,000 - Curated RefinedWeb with medium context length (2048 <= ctx_len <= 8192)
+    *[
+        {'kind': 'base', 'path': 'vilm/refinedweb-1m-medium', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+    # 4.01 GB, 1,360,929
+    *[
+        {'kind': 'base', 'path': 'deatos/fineweb-edu-mini-combined', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 5)
+    ],
+
     #
     # multilingual
     #
@@ -76,6 +90,15 @@ core_base_datasets = [
         for i in range(0, 100, 10)
     ],
 
+    #
+    # math / code
+    #
+    # 2.23 GB, 719,244
+    *[
+        {'kind': 'base', 'path': 'MathGenie/MathCode-Pile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
+        for i in range(0, 100, 10)
+    ],
+
     #
     # general knowledge
     #
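The new entries follow the same shape as the existing ones: a percent-sliced `split` plus a `format` callable that extracts the text field. A minimal sketch of how one entry could be materialized, assuming the Hugging Face `datasets` library (the actual consumer of `core_base_datasets` is not part of this commit):

```python
from datasets import load_dataset

# One of the new entries, written out literally; the comprehension in the
# script generates twenty such 5% slices.
entry = {
    'kind': 'base',
    'path': 'vilm/refinedweb-1m-medium',
    'split': 'train[0%:5%]',
    'format': lambda n: n['text'],
}

ds = load_dataset(entry['path'], split=entry['split'])
texts = (entry['format'](row) for row in ds)
print(next(texts)[:200])
```

Slicing each corpus into small percent shards presumably keeps individual download-and-tokenize units manageable and lets shards from different groups interleave.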
scripts/prepare_core_datasets.py CHANGED
@@ -13,13 +13,13 @@ tokenizer_path = '../tokenizer'
 
 seqs = [
     (0, 1073741824, 1025, 16000),
-    (1025, 2049, 2049, 8000),
-    (2049, 4097, 4097, 4000),
-    (4097, 8193, 8193, 2000),
-    (8193, 16385, 16385, 1000),
-    (16385, 32769, 32769, 500),
-    (32769, 65537, 65537, 250),
-    (65537, 131073, 131073, 125),
+    # (1025, 2049, 2049, 8000),
+    # (2049, 4097, 4097, 4000),
+    # (4097, 8193, 8193, 2000),
+    # (8193, 16385, 16385, 1000),
+    # (16385, 32769, 32769, 500),
+    # (32769, 65537, 65537, 250),
+    # (65537, 131073, 131073, 125),
 ]
 
 #
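With every other tuple commented out, this run prepares a single sequence-length bucket. The tuple fields are not documented in the commit; one hypothetical reading consistent with the values, (min_len, max_len, block_size, subset_size), is sketched below:

```python
# Hypothetical interpretation of a `seqs` entry; the field names are an
# assumption, not taken from this repository.
seqs = [
    (0, 1073741824, 1025, 16000),  # any length below 2**30, packed into 1025-token blocks
]

def bucket_for(n_tokens: int):
    """Return (block_size, subset_size) of the first bucket covering n_tokens."""
    for min_len, max_len, block_size, subset_size in seqs:
        if min_len <= n_tokens < max_len:
            return block_size, subset_size
    return None

print(bucket_for(4096))  # (1025, 16000): the only bucket left active
```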
scripts/pretrain_core_model_0.yaml CHANGED
@@ -10,7 +10,7 @@ model_config:
   vocab_size: 131072
   padded_vocab_size: 131072
   n_layer: 32
-  n_head: 8
+  n_head: 32
   n_embd: 512
   n_query_groups: 8
   rotary_percentage: 1.0
@@ -21,7 +21,7 @@ model_config:
   intermediate_size: 2048 # n_embd * 4
   norm_eps: 1e-5
   rope_base: 4300 # https://arxiv.org/pdf/2405.14591
-  head_size: 128 # n_embd / n_head
+  head_size: 256 # n_embd / n_head
 
 # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
 # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
@@ -85,7 +85,7 @@ train:
   max_norm: 1.0
 
   # (type: float, default: 4e-05)
-  min_lr: 3e-5
+  min_lr: 1e-5
 
 # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
 eval:
@@ -105,17 +105,17 @@ eval:
   final_validation: true
 
 # Optimizer-related arguments
-optimizer:
-  class_path: torch.optim.AdamW
-  init_args:
-    # (type: float, default: 0.001)
-    lr: 3e-4
-    # (type: float, default: 0.01)
-    weight_decay: 0.01
-    # (type: tuple, default: (0.9,0.999))
-    betas:
-      - 0.9
-      - 0.999
+# optimizer:
+#   class_path: torch.optim.AdamW
+#   init_args:
+#     # (type: float, default: 0.001)
+#     lr: 3e-4
+#     # (type: float, default: 0.01)
+#     weight_decay: 0.01
+#     # (type: tuple, default: (0.9,0.999))
+#     betas:
+#       - 0.9
+#       - 0.999
 
 # optimizer:
 #   class_path: sophia_opt.SophiaG
@@ -127,6 +127,16 @@ optimizer:
 #     rho: 0.05
 #     weight_decay: 0.1
 
+optimizer:
+  class_path: sophia_opt.SophiaG
+  init_args:
+    lr: 1e-4
+    betas:
+      - 0.965
+      - 0.99
+    rho: 0.04
+    weight_decay: 1e-1
+
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
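The active optimizer switches from AdamW to SophiaG, with hyperparameters matching the defaults of the reference Sophia implementation (Liu et al., 2023, arXiv:2305.14342). A minimal sketch of the equivalent construction in plain PyTorch, assuming the same local `sophia_opt` module the `class_path` refers to:

```python
import torch
from sophia_opt import SophiaG  # local module named in the YAML's class_path

model = torch.nn.Linear(512, 512)  # stand-in; litgpt builds the real GPT module

optimizer = SophiaG(
    model.parameters(),
    lr=1e-4,
    betas=(0.965, 0.99),  # heavier first-moment smoothing than AdamW's 0.9
    rho=0.04,             # scales the Hessian estimate in the clipped update
    weight_decay=1e-1,
)
```

litgpt instantiates this from the `class_path`/`init_args` pair at startup, so swapping optimizers needs no code change, only this YAML edit.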