pretrain core 1

- config-0.json (+1 -1)
- scripts/pretrain_core_model_0.yaml (+12 -24)
config-0.json CHANGED
@@ -21,7 +21,7 @@
   "rms_norm_eps": 1e-05,
   "rope_scaling": null,
   "rope_theta": 4300.0,
-  "tie_word_embeddings":
+  "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.45.0.dev0",
   "use_cache": true,
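This change, together with tie_embeddings: false in the training YAML below, explicitly unties the input embeddings from the language-modeling head. As an illustrative PyTorch sketch only (none of this code is from the repo, and the sizes are placeholders rather than values from config-0.json), untied embeddings mean the token-embedding matrix and the output projection are two independent parameter tensors instead of one shared tensor:

import torch.nn as nn

vocab_size, hidden_size = 32000, 2048  # placeholder sizes, not from config-0.json

embed = nn.Embedding(vocab_size, hidden_size)              # input embeddings
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)   # output projection

# tie_word_embeddings: true would instead share a single tensor, roughly:
#   lm_head.weight = embed.weight
assert embed.weight.data_ptr() != lm_head.weight.data_ptr()  # untied: separate storage

Untying increases the vocabulary-related parameter count (two vocab × hidden matrices instead of one) but lets the input and output matrices specialize independently.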
scripts/pretrain_core_model_0.yaml CHANGED
@@ -61,7 +61,6 @@ train:
   global_batch_size: 512
 
   # Number of samples per data-parallel rank (type: int, default: 4)
-  # micro_batch_size: 2
   micro_batch_size: 8
 
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
@@ -77,11 +76,10 @@ train:
   max_steps:
 
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  # max_seq_length: 4096
   max_seq_length: 1024
 
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
-  tie_embeddings:
+  tie_embeddings: false
 
   # (type: Optional[float], default: 1.0)
   max_norm: 1.0
@@ -107,22 +105,17 @@ eval:
   final_validation: true
 
 # Optimizer-related arguments
-
-
-
-#
-
-#
-
-#
-
-
-
-# weight_decay: 0.01
-# # (type: tuple, default: (0.9,0.999))
-# betas:
-#   - 0.9
-#   - 0.999
+optimizer:
+  class_path: torch.optim.AdamW
+  init_args:
+    # (type: float, default: 0.001)
+    lr: 3e-4
+    # (type: float, default: 0.01)
+    weight_decay: 0.01
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+    - 0.9
+    - 0.999
 
 # optimizer:
 #   class_path: sophia_opt.SophiaG
@@ -134,11 +127,6 @@ eval:
 #   rho: 0.05
 #   weight_decay: 0.1
 
-optimizer:
-  class_path: dolphinflow.DolphinFlow
-  init_args:
-    lr: 3e-4
-
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
 
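The commented-out micro_batch_size and max_seq_length alternatives are dropped, the active values (8 and 1024) are kept, and the optimizer is switched from dolphinflow.DolphinFlow to stock torch.optim.AdamW. As a sketch only (the stand-in module and the device count below are assumptions, not taken from the repo; devices: auto simply uses however many GPUs are visible), the class_path/init_args pair corresponds to constructing the optimizer roughly like this, and the batch settings imply a gradient-accumulation factor of global_batch_size / (micro_batch_size × num_devices):

import torch
import torch.nn as nn

model = nn.Linear(16, 16)  # stand-in module; the real run builds the model from its own config

# class_path: torch.optim.AdamW with the init_args from the YAML
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,             # init_args.lr
    weight_decay=0.01,   # init_args.weight_decay
    betas=(0.9, 0.999),  # init_args.betas
)

# Batch bookkeeping implied by the YAML (num_devices = 8 is an assumption)
global_batch_size, micro_batch_size, num_devices = 512, 8, 8
grad_accum_steps = global_batch_size // (micro_batch_size * num_devices)  # = 8 here

With max_seq_length: 1024, each optimizer step therefore covers up to 512 × 1024 ≈ 524k tokens.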