aflah committed
Commit 50732c4 · verified · 1 Parent(s): e459a6e

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +56 -0
  2. model_name=0--step=1199-consumed_samples=614400.0/weights/__1_1.distcp +3 -0
  3. model_name=0--step=1199-consumed_samples=614400.0/weights/__3_0.distcp +3 -0
  4. model_name=0--step=1199-consumed_samples=614400.0/weights/__4_1.distcp +3 -0
  5. model_name=0--step=1199-consumed_samples=614400.0/weights/__7_1.distcp +3 -0
  6. model_name=0--step=1274-consumed_samples=652800.0-last/context/879f2755-e403-4434-84bd-93beb6106877 +0 -0
  7. model_name=0--step=1274-consumed_samples=652800.0-last/context/964a8138-0ed9-4e86-94a3-22b30e4b6906 +0 -0
  8. model_name=0--step=1274-consumed_samples=652800.0-last/context/cf37c7b6-77c3-44d0-905c-5082b4d0580a +0 -0
  9. model_name=0--step=1274-consumed_samples=652800.0-last/context/io.json +1 -0
  10. model_name=0--step=1274-consumed_samples=652800.0-last/context/model.yaml +266 -0
  11. model_name=0--step=1274-consumed_samples=652800.0-last/context/tokenizer_config.json +238 -0
  12. model_name=0--step=1274-consumed_samples=652800.0-last/weights/common.pt +3 -0
  13. model_name=0--step=1299-consumed_samples=665600.0/weights/__1_0.distcp +3 -0
  14. model_name=0--step=1299-consumed_samples=665600.0/weights/__2_0.distcp +3 -0
  15. model_name=0--step=1299-consumed_samples=665600.0/weights/__4_0.distcp +3 -0
  16. model_name=0--step=1299-consumed_samples=665600.0/weights/__5_1.distcp +3 -0
  17. model_name=0--step=399-consumed_samples=204800.0/weights/.metadata +3 -0
  18. model_name=0--step=399-consumed_samples=204800.0/weights/common.pt +3 -0
  19. model_name=0--step=399-consumed_samples=204800.0/weights/metadata.json +1 -0
  20. model_name=0--step=699-consumed_samples=358400.0/context/030e253d-d59d-444c-88ce-d1cc0887e916 +0 -0
  21. model_name=0--step=699-consumed_samples=358400.0/context/dbaa4557-3c32-4811-8655-c6eedf50a52e +0 -0
  22. model_name=0--step=699-consumed_samples=358400.0/context/fc700484-6697-44f7-8535-3c24e044c2d2 +0 -0
  23. model_name=0--step=699-consumed_samples=358400.0/context/model.yaml +266 -0
  24. model_name=0--step=699-consumed_samples=358400.0/context/tokenizer_config.json +238 -0
  25. model_name=0--step=799-consumed_samples=409600.0/context/3a9305c4-7453-4cad-a094-f9f6012b5392 +0 -0
  26. model_name=0--step=799-consumed_samples=409600.0/context/8d43aadf-2f71-4c43-864a-951b87890162 +0 -0
  27. model_name=0--step=799-consumed_samples=409600.0/context/fa28f4cf-dc94-495c-900b-a556df2fe4c0 +0 -0
  28. model_name=0--step=799-consumed_samples=409600.0/context/io.json +1 -0
  29. model_name=0--step=799-consumed_samples=409600.0/context/model.yaml +266 -0
  30. model_name=0--step=799-consumed_samples=409600.0/context/tokenizer_config.json +238 -0
  31. model_name=0--step=799-consumed_samples=409600.0/weights/.metadata +3 -0
  32. model_name=0--step=799-consumed_samples=409600.0/weights/__3_1.distcp +3 -0
  33. model_name=0--step=799-consumed_samples=409600.0/weights/__7_0.distcp +3 -0
  34. model_name=0--step=799-consumed_samples=409600.0/weights/common.pt +3 -0
  35. model_name=0--step=799-consumed_samples=409600.0/weights/metadata.json +1 -0
  36. model_name=0--step=899-consumed_samples=460800.0/weights/__0_0.distcp +3 -0
  37. model_name=0--step=899-consumed_samples=460800.0/weights/__0_1.distcp +3 -0
  38. model_name=0--step=899-consumed_samples=460800.0/weights/__1_0.distcp +3 -0
  39. model_name=0--step=899-consumed_samples=460800.0/weights/__1_1.distcp +3 -0
  40. model_name=0--step=899-consumed_samples=460800.0/weights/__2_0.distcp +3 -0
  41. model_name=0--step=899-consumed_samples=460800.0/weights/__2_1.distcp +3 -0
  42. model_name=0--step=899-consumed_samples=460800.0/weights/__3_0.distcp +3 -0
  43. model_name=0--step=899-consumed_samples=460800.0/weights/__3_1.distcp +3 -0
  44. model_name=0--step=899-consumed_samples=460800.0/weights/__4_0.distcp +3 -0
  45. model_name=0--step=899-consumed_samples=460800.0/weights/__4_1.distcp +3 -0
  46. model_name=0--step=899-consumed_samples=460800.0/weights/__5_0.distcp +3 -0
  47. model_name=0--step=899-consumed_samples=460800.0/weights/__5_1.distcp +3 -0
  48. model_name=0--step=899-consumed_samples=460800.0/weights/__6_0.distcp +3 -0
  49. model_name=0--step=899-consumed_samples=460800.0/weights/__6_1.distcp +3 -0
  50. model_name=0--step=899-consumed_samples=460800.0/weights/__7_0.distcp +3 -0
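Each checkpoint directory name encodes the optimizer step and the cumulative sample count; across every folder in this commit, consumed_samples equals (step + 1) × 512, which matches the global batch size of 512 recorded in the serialized trainer context further down. A minimal sketch of pulling a single checkpoint folder with huggingface_hub (the repo id is a placeholder, since the repository name is not shown in this view):

```python
from huggingface_hub import snapshot_download

# Placeholder repo id -- substitute the repository this commit belongs to.
REPO_ID = "<user>/<checkpoint-repo>"

# Download only one checkpoint folder instead of the whole multi-GB repository.
local_dir = snapshot_download(
    repo_id=REPO_ID,
    allow_patterns=["model_name=0--step=1199-consumed_samples=614400.0/*"],
)
print("checkpoint downloaded to", local_dir)

# Sanity check of the naming convention: consumed_samples == (step + 1) * global_batch_size.
step, consumed_samples, global_batch_size = 1199, 614400.0, 512
assert consumed_samples == (step + 1) * global_batch_size
```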
.gitattributes CHANGED
@@ -105,3 +105,59 @@ model_name=0--step=1299-consumed_samples=665600.0/weights/__6_1.distcp filter=lf
  model_name=0--step=1299-consumed_samples=665600.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
  model_name=0--step=1299-consumed_samples=665600.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
  model_name=0--step=1299-consumed_samples=665600.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1199-consumed_samples=614400.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1199-consumed_samples=614400.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1199-consumed_samples=614400.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1199-consumed_samples=614400.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=99-consumed_samples=51200.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=899-consumed_samples=460800.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
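The block above extends .gitattributes so that every newly uploaded weight shard and .metadata file is routed through Git LFS; only small pointer files (shown next) live in the Git history itself. A short sketch, assuming a local clone, that lists which paths are tracked via LFS:

```python
# List the paths that .gitattributes routes through Git LFS.
# Assumes a local clone with .gitattributes in the working directory.
lfs_paths = []
with open(".gitattributes") as attrs:
    for line in attrs:
        fields = line.split()
        if fields and "filter=lfs" in fields[1:]:
            lfs_paths.append(fields[0])

print(f"{len(lfs_paths)} paths tracked via LFS")
for path in lfs_paths[:5]:
    print(" ", path)
```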
model_name=0--step=1199-consumed_samples=614400.0/weights/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ae04f0e23098ee5765f4bc9eb4f69c20a27bad38a2c6255396be2c9666d29c9
+ size 943962344
model_name=0--step=1199-consumed_samples=614400.0/weights/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddc22f05107252347bb8d64af0daf05a11b0a89f65520df7446767a060cd5f48
3
+ size 943054924
model_name=0--step=1199-consumed_samples=614400.0/weights/__4_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3360cd3c5eafa2749be01c26e1efe7965614e5269e7d3558018682eb2fe77768
+ size 944985984
model_name=0--step=1199-consumed_samples=614400.0/weights/__7_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1a4193cb55312f8064fbab7a69bb35b0c16bc33c12a26f18f1eee9a1fcbaabf
+ size 943936384
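Each .distcp entry above is a Git LFS pointer: three lines giving the spec version, the SHA-256 of the real blob, and its size in bytes (roughly 0.9 GB per shard here). A minimal sketch, with placeholder paths, of checking a downloaded shard against its pointer:

```python
import hashlib

# Placeholder paths: the pointer text as committed, and the actual shard
# fetched via `git lfs pull` or the Hub.
POINTER = "model_name=0--step=1199-consumed_samples=614400.0/weights/__1_1.distcp"
BLOB = "downloads/__1_1.distcp"

# Parse the "key value" lines of the pointer (version, oid, size).
fields = {}
with open(POINTER) as f:
    for line in f:
        key, _, value = line.strip().partition(" ")
        fields[key] = value

expected_sha = fields["oid"].removeprefix("sha256:")
expected_size = int(fields["size"])

sha = hashlib.sha256()
size = 0
with open(BLOB, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
        size += len(chunk)

assert size == expected_size, (size, expected_size)
assert sha.hexdigest() == expected_sha, "checksum mismatch"
print("blob matches its LFS pointer")
```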
model_name=0--step=1274-consumed_samples=652800.0-last/context/879f2755-e403-4434-84bd-93beb6106877 ADDED
Binary file (173 Bytes).
 
model_name=0--step=1274-consumed_samples=652800.0-last/context/964a8138-0ed9-4e86-94a3-22b30e4b6906 ADDED
Binary file (584 Bytes).
 
model_name=0--step=1274-consumed_samples=652800.0-last/context/cf37c7b6-77c3-44d0-905c-5082b4d0580a ADDED
Binary file (202 Bytes).
 
model_name=0--step=1274-consumed_samples=652800.0-last/context/io.json ADDED
@@ -0,0 +1 @@
+ {"root": {"type": "ref", "key": "trainer_context_1"}, "objects": {"tuple_1": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "sequence_parallel"], ["Index(index=4)", "context_parallel_size"], ["Index(index=5)", "expert_model_parallel_size"], ["Index(index=6)", "expert_tensor_parallel_size"], ["Index(index=7)", "moe_extended_tp"], ["Index(index=8)", "bf16"], ["Index(index=9)", "params_dtype"], ["Index(index=10)", "autocast_dtype"], ["Index(index=11)", "use_te_rng_tracker"], ["Index(index=12)", "pipeline_dtype"], ["Index(index=13)", "microbatch_group_size_per_vp_stage"], ["Index(index=14)", "account_for_embedding_in_pipeline_split"], ["Index(index=15)", "account_for_loss_in_pipeline_split"], ["Index(index=16)", "share_embeddings_and_output_weights"], ["Index(index=17)", "seq_length"]], "metadata": null}, "dict_1": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_2": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "Llama32Config1B"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_1"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_1"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_2"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama32_config1_b_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.virtual_pipeline_model_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.sequence_parallel"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.context_parallel_size"]}], ["Attr(name='expert_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.expert_model_parallel_size"]}], ["Attr(name='expert_tensor_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.expert_tensor_parallel_size"]}], ["Attr(name='moe_extended_tp')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.moe_extended_tp"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='autocast_dtype')", {"type": "pyref", 
"module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='use_te_rng_tracker')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.use_te_rng_tracker"]}], ["Attr(name='pipeline_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='microbatch_group_size_per_vp_stage')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.microbatch_group_size_per_vp_stage"]}], ["Attr(name='account_for_embedding_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_embedding_in_pipeline_split"]}], ["Attr(name='account_for_loss_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_loss_in_pipeline_split"]}], ["Attr(name='share_embeddings_and_output_weights')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.share_embeddings_and_output_weights"]}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.model.config.seq_length"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_1"}, "paths": ["<root>.model.config"]}, "tuple_2": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "optimizer"], ["Index(index=1)", "lr"], ["Index(index=2)", "min_lr"], ["Index(index=3)", "decoupled_lr"], ["Index(index=4)", "decoupled_min_lr"], ["Index(index=5)", "weight_decay"], ["Index(index=6)", "fp16"], ["Index(index=7)", "bf16"], ["Index(index=8)", "params_dtype"], ["Index(index=9)", "use_precision_aware_optimizer"], ["Index(index=10)", "main_grads_dtype"], ["Index(index=11)", "main_params_dtype"], ["Index(index=12)", "exp_avg_dtype"], ["Index(index=13)", "exp_avg_sq_dtype"], ["Index(index=14)", "loss_scale"], ["Index(index=15)", "initial_loss_scale"], ["Index(index=16)", "min_loss_scale"], ["Index(index=17)", "loss_scale_window"], ["Index(index=18)", "hysteresis"], ["Index(index=19)", "adam_beta1"], ["Index(index=20)", "adam_beta2"], ["Index(index=21)", "adam_eps"], ["Index(index=22)", "sgd_momentum"], ["Index(index=23)", "use_distributed_optimizer"], ["Index(index=24)", "overlap_param_gather_with_optimizer_step"], ["Index(index=25)", "clip_grad"], ["Index(index=26)", "log_num_zeros_in_grad"], ["Index(index=27)", "barrier_with_L1_time"], ["Index(index=28)", "timers"], ["Index(index=29)", "config_logger_dir"]], "metadata": null}, "dict_3": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_4": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.optimizer.optimizer_config", "name": "OptimizerConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_2"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_3"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_4"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": 
"BuildableTraverserMetadata"}}, "optimizer_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='optimizer')", {"type": "leaf", "value": "adam", "paths": ["<root>.model.optim.config.optimizer"]}], ["Attr(name='lr')", {"type": "leaf", "value": 0.0003, "paths": ["<root>.model.optim.config.lr"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.min_lr"]}], ["Attr(name='decoupled_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_lr"]}], ["Attr(name='decoupled_min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_min_lr"]}], ["Attr(name='weight_decay')", {"type": "leaf", "value": 0.1, "paths": ["<root>.model.optim.config.weight_decay"]}], ["Attr(name='fp16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.fp16"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='use_precision_aware_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_precision_aware_optimizer"]}], ["Attr(name='main_grads_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='main_params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_sq_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='loss_scale')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.loss_scale"]}], ["Attr(name='initial_loss_scale')", {"type": "leaf", "value": 4294967296, "paths": ["<root>.model.optim.config.initial_loss_scale"]}], ["Attr(name='min_loss_scale')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.min_loss_scale"]}], ["Attr(name='loss_scale_window')", {"type": "leaf", "value": 1000, "paths": ["<root>.model.optim.config.loss_scale_window"]}], ["Attr(name='hysteresis')", {"type": "leaf", "value": 2, "paths": ["<root>.model.optim.config.hysteresis"]}], ["Attr(name='adam_beta1')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.adam_beta1"]}], ["Attr(name='adam_beta2')", {"type": 
"leaf", "value": 0.95, "paths": ["<root>.model.optim.config.adam_beta2"]}], ["Attr(name='adam_eps')", {"type": "leaf", "value": 1e-05, "paths": ["<root>.model.optim.config.adam_eps"]}], ["Attr(name='sgd_momentum')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.sgd_momentum"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.use_distributed_optimizer"]}], ["Attr(name='overlap_param_gather_with_optimizer_step')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_param_gather_with_optimizer_step"]}], ["Attr(name='clip_grad')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.clip_grad"]}], ["Attr(name='log_num_zeros_in_grad')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.log_num_zeros_in_grad"]}], ["Attr(name='barrier_with_L1_time')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.barrier_with_L1_time"]}], ["Attr(name='timers')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.timers"]}], ["Attr(name='config_logger_dir')", {"type": "leaf", "value": "", "paths": ["<root>.model.optim.config.config_logger_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_2"}, "paths": ["<root>.model.optim.config"]}, "tuple_3": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "warmup_steps"], ["Index(index=1)", "constant_steps"], ["Index(index=2)", "min_lr"]], "metadata": null}, "dict_5": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_6": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_3": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.lr_scheduler", "name": "CosineAnnealingScheduler"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_3"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_5"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_6"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "cosine_annealing_scheduler_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='warmup_steps')", {"type": "leaf", "value": 2000, "paths": ["<root>.model.optim.lr_scheduler.warmup_steps"]}], ["Attr(name='constant_steps')", {"type": "leaf", "value": 0, "paths": ["<root>.model.optim.lr_scheduler.constant_steps"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": 2.9999999999999997e-05, "paths": ["<root>.model.optim.lr_scheduler.min_lr"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_3"}, "paths": ["<root>.model.optim.lr_scheduler"]}, "tuple_4": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "lr_scheduler"]], "metadata": null}, "dict_7": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_8": {"type": 
{"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_4": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.megatron", "name": "MegatronOptimizerModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_4"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_7"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_8"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_optimizer_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "optimizer_config_1"}], ["Attr(name='lr_scheduler')", {"type": "ref", "key": "cosine_annealing_scheduler_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_4"}, "paths": ["<root>.model.optim"]}, "tuple_5": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "attr"], ["Index(index=1)", "skip"]], "metadata": null}, "dict_9": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_10": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_5": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.artifact.file", "name": "DirOrStringArtifact"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_5"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_9"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_10"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "dir_or_string_artifact_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='attr')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.model.tokenizer.pretrained_model_name.attr"]}], ["Attr(name='skip')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.pretrained_model_name.skip"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_5"}, "paths": ["<root>.model.tokenizer.pretrained_model_name"]}, "tuple_6": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_11": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_12": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_6": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": 
[["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_6"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_11"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_12"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "ref", "key": "dir_or_string_artifact_1"}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "tokenizer_config.json", "paths": ["<root>.model.tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_6"}, "paths": ["<root>.model.tokenizer"]}, "tuple_7": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "optim"], ["Index(index=2)", "tokenizer"]], "metadata": null}, "dict_13": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_14": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_7": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "LlamaModel"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_7"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_13"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_14"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama_model_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "llama32_config1_b_1"}], ["Attr(name='optim')", {"type": "ref", "key": "megatron_optimizer_module_1"}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_7"}, "paths": ["<root>.model"]}, "tuple_8": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "grad_reduce_in_fp32"], ["Index(index=1)", "overlap_grad_reduce"], ["Index(index=2)", "overlap_param_gather"], ["Index(index=3)", "align_param_gather"], ["Index(index=4)", "use_distributed_optimizer"], ["Index(index=5)", "num_distributed_optimizer_instances"], ["Index(index=6)", "check_for_nan_in_grad"], ["Index(index=7)", "bucket_size"], ["Index(index=8)", "average_in_collective"], ["Index(index=9)", "fp8_param_gather"]], "metadata": null}, "dict_15": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_16": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "buildable_traverser_metadata_8": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.distributed.distributed_data_parallel_config", "name": "DistributedDataParallelConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_8"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_15"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_16"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "distributed_data_parallel_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='grad_reduce_in_fp32')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.grad_reduce_in_fp32"]}], ["Attr(name='overlap_grad_reduce')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_grad_reduce"]}], ["Attr(name='overlap_param_gather')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_param_gather"]}], ["Attr(name='align_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.align_param_gather"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.use_distributed_optimizer"]}], ["Attr(name='num_distributed_optimizer_instances')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.ddp.num_distributed_optimizer_instances"]}], ["Attr(name='check_for_nan_in_grad')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.check_for_nan_in_grad"]}], ["Attr(name='bucket_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.ddp.bucket_size"]}], ["Attr(name='average_in_collective')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.average_in_collective"]}], ["Attr(name='fp8_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.fp8_param_gather"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_8"}, "paths": ["<root>.trainer.strategy.ddp"]}, "tuple_9": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "gradient_as_bucket_view"]], "metadata": null}, "dict_17": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='gradient_as_bucket_view')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.kwargs['gradient_as_bucket_view']"]}]], "metadata": {"type": "ref", "key": "tuple_9"}, "paths": ["<root>.trainer.strategy.kwargs"]}, "tuple_10": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "context_parallel_size"], ["Index(index=4)", "sequence_parallel"], ["Index(index=5)", "ddp"], ["Index(index=6)", "pipeline_dtype"], ["Index(index=7)", "ckpt_async_save"], ["Index(index=8)", "ckpt_parallel_load"], ["Index(index=9)", "kwargs"]], "metadata": null}, "dict_18": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_19": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": 
"pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_9": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.strategies.megatron_strategy", "name": "MegatronStrategy"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_10"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_18"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_19"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_strategy_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.virtual_pipeline_model_parallel_size"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.context_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.sequence_parallel"]}], ["Attr(name='ddp')", {"type": "ref", "key": "distributed_data_parallel_config_1"}], ["Attr(name='pipeline_dtype')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.pipeline_dtype"]}], ["Attr(name='ckpt_async_save')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_async_save"]}], ["Attr(name='ckpt_parallel_load')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_parallel_load"]}], ["Attr(name='kwargs')", {"type": "ref", "key": "dict_17"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_9"}, "paths": ["<root>.trainer.strategy"]}, "timing_callback_1": {"type": {"type": "pyref", "module": "nemo.utils.exp_manager", "name": "TimingCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "cf37c7b6-77c3-44d0-905c-5082b4d0580a", "paths": ["<root>.trainer.callbacks[0]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[0]"]}, "garbage_collection_callback_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.callbacks.garbage_collection", "name": "GarbageCollectionCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "879f2755-e403-4434-84bd-93beb6106877", "paths": ["<root>.trainer.callbacks[1]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[1]"]}, "list_1": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "ref", "key": "timing_callback_1"}], ["Index(index=1)", {"type": "ref", "key": "garbage_collection_callback_1"}]], "metadata": null, "paths": ["<root>.trainer.callbacks"]}, "megatron_mixed_precision_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.plugins.mixed_precision", "name": "MegatronMixedPrecision"}, "items": [["IdentityElement()", {"type": "leaf", "value": "964a8138-0ed9-4e86-94a3-22b30e4b6906", "paths": ["<root>.trainer.plugins"]}]], "metadata": null, "paths": ["<root>.trainer.plugins"]}, "tuple_11": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "accelerator"], 
["Index(index=1)", "strategy"], ["Index(index=2)", "devices"], ["Index(index=3)", "num_nodes"], ["Index(index=4)", "callbacks"], ["Index(index=5)", "max_steps"], ["Index(index=6)", "limit_val_batches"], ["Index(index=7)", "val_check_interval"], ["Index(index=8)", "log_every_n_steps"], ["Index(index=9)", "accumulate_grad_batches"], ["Index(index=10)", "use_distributed_sampler"], ["Index(index=11)", "plugins"]], "metadata": null}, "dict_20": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_21": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_10": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.trainer", "name": "Trainer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_11"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_20"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_21"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='accelerator')", {"type": "leaf", "value": "gpu", "paths": ["<root>.trainer.accelerator"]}], ["Attr(name='strategy')", {"type": "ref", "key": "megatron_strategy_1"}], ["Attr(name='devices')", {"type": "leaf", "value": 8, "paths": ["<root>.trainer.devices"]}], ["Attr(name='num_nodes')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.num_nodes"]}], ["Attr(name='callbacks')", {"type": "ref", "key": "list_1"}], ["Attr(name='max_steps')", {"type": "leaf", "value": 1168251, "paths": ["<root>.trainer.max_steps"]}], ["Attr(name='limit_val_batches')", {"type": "leaf", "value": 32, "paths": ["<root>.trainer.limit_val_batches"]}], ["Attr(name='val_check_interval')", {"type": "leaf", "value": 100, "paths": ["<root>.trainer.val_check_interval"]}], ["Attr(name='log_every_n_steps')", {"type": "leaf", "value": 10, "paths": ["<root>.trainer.log_every_n_steps"]}], ["Attr(name='accumulate_grad_batches')", {"type": "leaf", "value": 4, "paths": ["<root>.trainer.accumulate_grad_batches"]}], ["Attr(name='use_distributed_sampler')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.use_distributed_sampler"]}], ["Attr(name='plugins')", {"type": "ref", "key": "megatron_mixed_precision_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_10"}, "paths": ["<root>.trainer"]}, "list_2": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "leaf", "value": "Data/dclm_local_shard_1_megatron/concatenated.jsonl_text_document", "paths": ["<root>.extra['datamodule'].paths[0]"]}]], "metadata": null, "paths": ["<root>.extra['datamodule'].paths"]}, "tuple_12": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_22": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "dict_23": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_11": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_12"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_22"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_23"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.extra['datamodule'].tokenizer.pretrained_model_name"]}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "Data/tokenizer/tokenizer_config.json", "paths": ["<root>.extra['datamodule'].tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.extra['datamodule'].tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_11"}, "paths": ["<root>.extra['datamodule'].tokenizer"]}, "tuple_13": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "paths"], ["Index(index=1)", "seq_length"], ["Index(index=2)", "tokenizer"], ["Index(index=3)", "micro_batch_size"], ["Index(index=4)", "global_batch_size"], ["Index(index=5)", "split"], ["Index(index=6)", "index_mapping_dir"]], "metadata": null}, "dict_24": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_25": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_12": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.data.pre_training", "name": "PreTrainingDataModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_13"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_24"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_25"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "pre_training_data_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='paths')", {"type": "ref", "key": "list_2"}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.extra['datamodule'].seq_length"]}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_2"}], ["Attr(name='micro_batch_size')", {"type": "leaf", "value": 16, "paths": ["<root>.extra['datamodule'].micro_batch_size"]}], ["Attr(name='global_batch_size')", {"type": "leaf", "value": 512, "paths": ["<root>.extra['datamodule'].global_batch_size"]}], ["Attr(name='split')", {"type": "leaf", "value": "99,8,2", "paths": ["<root>.extra['datamodule'].split"]}], 
["Attr(name='index_mapping_dir')", {"type": "leaf", "value": "Data/index_mapping_local_shard_1", "paths": ["<root>.extra['datamodule'].index_mapping_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_12"}, "paths": ["<root>.extra['datamodule']"]}, "tuple_14": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "datamodule"]], "metadata": null}, "dict_26": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='datamodule')", {"type": "ref", "key": "pre_training_data_module_1"}]], "metadata": {"type": "ref", "key": "tuple_14"}, "paths": ["<root>.extra"]}, "tuple_15": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "model"], ["Index(index=1)", "trainer"], ["Index(index=2)", "extra"]], "metadata": null}, "dict_27": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_28": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_13": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.pl", "name": "TrainerContext"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_15"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_27"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_28"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_context_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='model')", {"type": "ref", "key": "llama_model_1"}], ["Attr(name='trainer')", {"type": "ref", "key": "trainer_1"}], ["Attr(name='extra')", {"type": "ref", "key": "dict_26"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_13"}, "paths": ["<root>"]}}, "refcounts": {"tuple_1": 1, "dict_1": 1, "dict_2": 1, "buildable_traverser_metadata_1": 1, "llama32_config1_b_1": 1, "tuple_2": 1, "dict_3": 1, "dict_4": 1, "buildable_traverser_metadata_2": 1, "optimizer_config_1": 1, "tuple_3": 1, "dict_5": 1, "dict_6": 1, "buildable_traverser_metadata_3": 1, "cosine_annealing_scheduler_1": 1, "tuple_4": 1, "dict_7": 1, "dict_8": 1, "buildable_traverser_metadata_4": 1, "megatron_optimizer_module_1": 1, "tuple_5": 1, "dict_9": 1, "dict_10": 1, "buildable_traverser_metadata_5": 1, "dir_or_string_artifact_1": 1, "tuple_6": 1, "dict_11": 1, "dict_12": 1, "buildable_traverser_metadata_6": 1, "auto_tokenizer_1": 1, "tuple_7": 1, "dict_13": 1, "dict_14": 1, "buildable_traverser_metadata_7": 1, "llama_model_1": 1, "tuple_8": 1, "dict_15": 1, "dict_16": 1, "buildable_traverser_metadata_8": 1, "distributed_data_parallel_config_1": 1, "tuple_9": 1, "dict_17": 1, "tuple_10": 1, "dict_18": 1, "dict_19": 1, "buildable_traverser_metadata_9": 1, "megatron_strategy_1": 1, "timing_callback_1": 1, "garbage_collection_callback_1": 1, "list_1": 1, "megatron_mixed_precision_1": 1, "tuple_11": 1, "dict_20": 1, "dict_21": 1, "buildable_traverser_metadata_10": 1, "trainer_1": 1, "list_2": 1, "tuple_12": 1, "dict_22": 1, "dict_23": 1, "buildable_traverser_metadata_11": 1, "auto_tokenizer_2": 1, "tuple_13": 1, 
"dict_24": 1, "dict_25": 1, "buildable_traverser_metadata_12": 1, "pre_training_data_module_1": 1, "tuple_14": 1, "dict_26": 1, "tuple_15": 1, "dict_27": 1, "dict_28": 1, "buildable_traverser_metadata_13": 1, "trainer_context_1": 1}, "version": "0.0.1"}
model_name=0--step=1274-consumed_samples=652800.0-last/context/model.yaml ADDED
@@ -0,0 +1,266 @@
+ _target_: nemo.collections.llm.gpt.model.llama.LlamaModel
+ config:
+   _cpu_offloading_context: null
+   _target_: nemo.collections.llm.gpt.model.llama.Llama32Config1B
+   account_for_embedding_in_pipeline_split: false
+   account_for_loss_in_pipeline_split: false
+   activation_func:
+     _call_: false
+     _target_: torch.nn.functional.silu
+   activation_func_fp8_input_store: false
+   add_bias_linear: false
+   add_qkv_bias: false
+   apply_query_key_layer_scaling: false
+   apply_residual_connection_post_layernorm: false
+   apply_rope_fusion: true
+   async_tensor_model_parallel_allreduce: false
+   attention_backend:
+     _call_: true
+     _target_: megatron.core.transformer.enums.AttnBackend
+   attention_dropout: 0.0
+   attention_softmax_in_fp32: false
+   autocast_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   barrier_with_L1_time: true
+   batch_p2p_comm: true
+   batch_p2p_sync: true
+   bf16: true
+   bias_activation_fusion: true
+   bias_dropout_fusion: true
+   calculate_per_token_loss: false
+   clone_scatter_output_in_embedding: true
+   config_logger_dir: ''
+   context_parallel_size: 1
+   cp_comm_type: null
+   cpu_offloading: false
+   cpu_offloading_activations: true
+   cpu_offloading_num_layers: 0
+   cpu_offloading_weights: true
+   cross_entropy_loss_fusion: true
+   cuda_graph_retain_backward_graph: false
+   cuda_graph_use_single_mempool: false
+   cuda_graph_warmup_steps: 3
+   data_step_fn:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
+   deallocate_pipeline_outputs: true
+   defer_embedding_wgrad_compute: false
+   deterministic_mode: false
+   disable_parameter_transpose_cache: false
+   distribute_saved_activations: null
+   enable_autocast: false
+   enable_cuda_graph: false
+   expert_model_parallel_size: 1
+   expert_tensor_parallel_size: null
+   external_cuda_graph: false
+   ffn_hidden_size: 8192
+   finalize_model_grads_func: null
+   flash_decode: false
+   forward_step_fn:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
+   fp16: false
+   fp16_lm_cross_entropy: false
+   fp32_residual_connection: false
+   fp8: null
+   fp8_amax_compute_algo: most_recent
+   fp8_amax_history_len: 1
+   fp8_dot_product_attention: false
+   fp8_interval: 1
+   fp8_margin: 0
+   fp8_multi_head_attention: false
+   fp8_wgrad: true
+   gated_linear_unit: true
+   grad_scale_func: null
+   grad_sync_func: null
+   gradient_accumulation_fusion: true
+   hidden_dropout: 0.0
+   hidden_size: 2048
+   hierarchical_context_parallel_sizes: null
+   high_freq_factor: 4
+   inference_rng_tracker: false
+   init_method: null
+   init_method_std: 0.02
+   kv_channels: null
+   layernorm_epsilon: 1.0e-05
+   layernorm_zero_centered_gamma: false
+   low_freq_factor: 1
+   make_vocab_size_divisible_by: 128
+   masked_softmax_fusion: true
+   memory_efficient_layer_norm: false
+   microbatch_group_size_per_vp_stage: 1
+   moe_aux_loss_coeff: 0
+   moe_expert_capacity_factor: null
+   moe_extended_tp: false
+   moe_ffn_hidden_size: null
+   moe_grouped_gemm: false
+   moe_input_jitter_eps: null
+   moe_layer_freq: 1
+   moe_layer_recompute: false
+   moe_pad_expert_input_to_capacity: false
+   moe_per_layer_logging: false
+   moe_permute_fusion: false
+   moe_router_bias_update_rate: 0.001
+   moe_router_enable_expert_bias: false
+   moe_router_group_topk: null
+   moe_router_load_balancing_type: aux_loss
+   moe_router_num_groups: null
+   moe_router_pre_softmax: false
+   moe_router_score_function: softmax
+   moe_router_topk: 2
+   moe_router_topk_limited_devices: null
+   moe_router_topk_scaling_factor: null
+   moe_shared_expert_intermediate_size: null
+   moe_shared_expert_overlap: false
+   moe_token_dispatcher_type: allgather
+   moe_token_drop_policy: probs
+   moe_token_dropping: false
+   moe_use_legacy_grouped_gemm: false
+   moe_z_loss_coeff: null
+   multi_latent_attention: false
+   no_sync_func: null
+   normalization: RMSNorm
+   num_attention_heads: 32
+   num_layers: 16
+   num_layers_in_first_pipeline_stage: null
+   num_layers_in_last_pipeline_stage: null
+   num_microbatches_with_partial_activation_checkpoints: null
+   num_moe_experts: null
+   num_query_groups: 8
+   old_context_len: 8192
+   output_layer_init_method: null
+   overlap_p2p_comm: false
+   overlap_p2p_comm_warmup_flush: false
+   parallel_output: true
+   param_sync_func: null
+   params_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   perform_initialization: true
+   persist_layer_norm: true
+   pipeline_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   pipeline_model_parallel_size: 1
+   pipeline_model_parallel_split_rank: null
+   position_embedding_type: rope
+   qk_layernorm: false
+   recompute_granularity: null
+   recompute_method: null
+   recompute_num_layers: null
+   rotary_base: 500000
+   rotary_interleaved: false
+   rotary_percent: 1.0
+   scale_factor: 32
+   scatter_embedding_sequence_parallel: true
+   seq_len_interpolation_factor: null
+   seq_length: 2048
+   sequence_parallel: false
+   share_embeddings_and_output_weights: true
+   softmax_scale: null
+   tensor_model_parallel_size: 1
+   test_mode: false
+   timers: null
+   tp_comm_atomic_ag: false
+   tp_comm_atomic_rs: false
+   tp_comm_bootstrap_backend: nccl
+   tp_comm_bulk_dgrad: true
+   tp_comm_bulk_wgrad: true
+   tp_comm_overlap: false
+   tp_comm_overlap_ag: true
+   tp_comm_overlap_disable_fc1: false
+   tp_comm_overlap_disable_qkv: false
+   tp_comm_overlap_rs: true
+   tp_comm_overlap_rs_dgrad: false
+   tp_comm_split_ag: true
+   tp_comm_split_rs: true
+   tp_only_amax_red: false
+   transformer_layer_spec:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
+   use_cpu_initialization: false
+   use_ring_exchange_p2p: false
+   use_te_rng_tracker: false
+   use_transformer_engine_full_layer_spec: false
+   variable_seq_lengths: false
+   virtual_pipeline_model_parallel_size: null
+   wgrad_deferral_limit: 0
+   window_size: null
+ model_transform: null
+ optim:
+   _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
+   config:
+     _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-05
+     barrier_with_L1_time: false
+     bf16: true
+     clip_grad: 1.0
+     config_logger_dir: ''
+     decoupled_lr: null
+     decoupled_min_lr: null
+     exp_avg_dtype:
+       _call_: false
+       _target_: torch.float32
+     exp_avg_sq_dtype:
+       _call_: false
+       _target_: torch.float32
+     fp16: false
+     hysteresis: 2
+     initial_loss_scale: 4294967296
+     log_num_zeros_in_grad: false
+     loss_scale: null
+     loss_scale_window: 1000
+     lr: 0.0003
+     main_grads_dtype:
+       _call_: false
+       _target_: torch.float32
+     main_params_dtype:
+       _call_: false
+       _target_: torch.float32
+     min_loss_scale: 1.0
+     min_lr: null
+     optimizer: adam
226
+ overlap_param_gather_with_optimizer_step: false
227
+ params_dtype:
228
+ _call_: false
229
+ _target_: torch.float32
230
+ sgd_momentum: 0.9
231
+ timers: null
232
+ use_distributed_optimizer: true
233
+ use_precision_aware_optimizer: false
234
+ weight_decay: 0.1
235
+ lr_mult: 1.0
236
+ lr_scheduler:
237
+ _target_: nemo.lightning.pytorch.optim.lr_scheduler.CosineAnnealingScheduler
238
+ constant_steps: 0
239
+ frequency: 1
240
+ interval: step
241
+ max_steps: 10
242
+ min_lr: 2.9999999999999997e-05
243
+ monitor: val_loss
244
+ warmup_steps: 2000
245
+ no_weight_decay_cond: null
246
+ scale_lr_cond: null
247
+ tokenizer:
248
+ _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
249
+ additional_special_tokens: []
250
+ bos_token: null
251
+ cls_token: null
252
+ eos_token: null
253
+ include_special_tokens: false
254
+ mask_token: null
255
+ merges_file: null
256
+ pad_token: null
257
+ pretrained_model_name:
258
+ _target_: nemo.lightning.io.artifact.file.DirOrStringArtifact
259
+ attr: allenai/OLMo-1B-hf
260
+ required: true
261
+ skip: true
262
+ sep_token: null
263
+ trust_remote_code: false
264
+ unk_token: null
265
+ use_fast: true
266
+ vocab_file: tokenizer_config.json
model_name=0--step=1274-consumed_samples=652800.0-last/context/tokenizer_config.json ADDED
@@ -0,0 +1,238 @@
+ {
+ "add_bos_token": false,
+ "add_eos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "|||IP_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "1": {
+ "content": "<|padding|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "50254": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50255": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50256": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50257": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50258": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50259": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50260": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50261": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50262": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50263": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50264": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50265": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50266": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50267": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50268": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50269": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50270": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50271": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50272": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50273": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50274": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50275": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50276": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50277": {
+ "content": "|||EMAIL_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50278": {
+ "content": "|||PHONE_NUMBER|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50279": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": null,
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<|padding|>",
+ "tokenizer_class": "GPTNeoXTokenizer",
+ "unk_token": null
+ }
model_name=0--step=1274-consumed_samples=652800.0-last/weights/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2d2a18d92d27b2fd631576811a6276c6c5f1510e8878e708869923e5a987ba1
+ size 9822
model_name=0--step=1299-consumed_samples=665600.0/weights/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cded9082390420e4d61cc3a44979d23bbb3d288da430922cf05e0c207eaacba9
+ size 314572800
model_name=0--step=1299-consumed_samples=665600.0/weights/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90d29f1d5b617e7a1f49a5e9d381eb99c136b8ea14f0dcbec823e6847774352c
+ size 350224384
model_name=0--step=1299-consumed_samples=665600.0/weights/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6be31ffe08454248b23cf96333045690b7725192e599f17ffcac41f5b3a49797
+ size 315621376
model_name=0--step=1299-consumed_samples=665600.0/weights/__5_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:48dba4789f6388f739cbe422dfdddb868a638866a0ba332e390bc206e74af08e
+ size 314572800
model_name=0--step=399-consumed_samples=204800.0/weights/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:618d361e159a9f1617c9f17cc25485af56da13f8d94e74bb32ce9d992c5a8c8a
+ size 272079
model_name=0--step=399-consumed_samples=204800.0/weights/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99a824719957fc8ae54eecb5a738dcd3bfaf0a8352dfe76e4c78d036f922a3d4
+ size 6083
model_name=0--step=399-consumed_samples=204800.0/weights/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
model_name=0--step=699-consumed_samples=358400.0/context/030e253d-d59d-444c-88ce-d1cc0887e916 ADDED
Binary file (202 Bytes).
model_name=0--step=699-consumed_samples=358400.0/context/dbaa4557-3c32-4811-8655-c6eedf50a52e ADDED
Binary file (584 Bytes).
model_name=0--step=699-consumed_samples=358400.0/context/fc700484-6697-44f7-8535-3c24e044c2d2 ADDED
Binary file (173 Bytes).
model_name=0--step=699-consumed_samples=358400.0/context/model.yaml ADDED
@@ -0,0 +1,266 @@
+ _target_: nemo.collections.llm.gpt.model.llama.LlamaModel
+ config:
+ _cpu_offloading_context: null
+ _target_: nemo.collections.llm.gpt.model.llama.Llama32Config1B
+ account_for_embedding_in_pipeline_split: false
+ account_for_loss_in_pipeline_split: false
+ activation_func:
+ _call_: false
+ _target_: torch.nn.functional.silu
+ activation_func_fp8_input_store: false
+ add_bias_linear: false
+ add_qkv_bias: false
+ apply_query_key_layer_scaling: false
+ apply_residual_connection_post_layernorm: false
+ apply_rope_fusion: true
+ async_tensor_model_parallel_allreduce: false
+ attention_backend:
+ _call_: true
+ _target_: megatron.core.transformer.enums.AttnBackend
+ attention_dropout: 0.0
+ attention_softmax_in_fp32: false
+ autocast_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ barrier_with_L1_time: true
+ batch_p2p_comm: true
+ batch_p2p_sync: true
+ bf16: true
+ bias_activation_fusion: true
+ bias_dropout_fusion: true
+ calculate_per_token_loss: false
+ clone_scatter_output_in_embedding: true
+ config_logger_dir: ''
+ context_parallel_size: 1
+ cp_comm_type: null
+ cpu_offloading: false
+ cpu_offloading_activations: true
+ cpu_offloading_num_layers: 0
+ cpu_offloading_weights: true
+ cross_entropy_loss_fusion: true
+ cuda_graph_retain_backward_graph: false
+ cuda_graph_use_single_mempool: false
+ cuda_graph_warmup_steps: 3
+ data_step_fn:
+ _call_: false
+ _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
+ deallocate_pipeline_outputs: true
+ defer_embedding_wgrad_compute: false
+ deterministic_mode: false
+ disable_parameter_transpose_cache: false
+ distribute_saved_activations: null
+ enable_autocast: false
+ enable_cuda_graph: false
+ expert_model_parallel_size: 1
+ expert_tensor_parallel_size: null
+ external_cuda_graph: false
+ ffn_hidden_size: 8192
+ finalize_model_grads_func: null
+ flash_decode: false
+ forward_step_fn:
+ _call_: false
+ _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
+ fp16: false
+ fp16_lm_cross_entropy: false
+ fp32_residual_connection: false
+ fp8: null
+ fp8_amax_compute_algo: most_recent
+ fp8_amax_history_len: 1
+ fp8_dot_product_attention: false
+ fp8_interval: 1
+ fp8_margin: 0
+ fp8_multi_head_attention: false
+ fp8_wgrad: true
+ gated_linear_unit: true
+ grad_scale_func: null
+ grad_sync_func: null
+ gradient_accumulation_fusion: true
+ hidden_dropout: 0.0
+ hidden_size: 2048
+ hierarchical_context_parallel_sizes: null
+ high_freq_factor: 4
+ inference_rng_tracker: false
+ init_method: null
+ init_method_std: 0.02
+ kv_channels: null
+ layernorm_epsilon: 1.0e-05
+ layernorm_zero_centered_gamma: false
+ low_freq_factor: 1
+ make_vocab_size_divisible_by: 128
+ masked_softmax_fusion: true
+ memory_efficient_layer_norm: false
+ microbatch_group_size_per_vp_stage: 1
+ moe_aux_loss_coeff: 0
+ moe_expert_capacity_factor: null
+ moe_extended_tp: false
+ moe_ffn_hidden_size: null
+ moe_grouped_gemm: false
+ moe_input_jitter_eps: null
+ moe_layer_freq: 1
+ moe_layer_recompute: false
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.001
+ moe_router_enable_expert_bias: false
+ moe_router_group_topk: null
+ moe_router_load_balancing_type: aux_loss
+ moe_router_num_groups: null
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: null
+ moe_router_topk_scaling_factor: null
+ moe_shared_expert_intermediate_size: null
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: null
+ multi_latent_attention: false
+ no_sync_func: null
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 16
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ num_microbatches_with_partial_activation_checkpoints: null
+ num_moe_experts: null
+ num_query_groups: 8
+ old_context_len: 8192
+ output_layer_init_method: null
+ overlap_p2p_comm: false
+ overlap_p2p_comm_warmup_flush: false
+ parallel_output: true
+ param_sync_func: null
+ params_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: true
+ pipeline_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ pipeline_model_parallel_size: 1
+ pipeline_model_parallel_split_rank: null
+ position_embedding_type: rope
+ qk_layernorm: false
+ recompute_granularity: null
+ recompute_method: null
+ recompute_num_layers: null
+ rotary_base: 500000
+ rotary_interleaved: false
+ rotary_percent: 1.0
+ scale_factor: 32
+ scatter_embedding_sequence_parallel: true
+ seq_len_interpolation_factor: null
+ seq_length: 2048
+ sequence_parallel: false
+ share_embeddings_and_output_weights: true
+ softmax_scale: null
+ tensor_model_parallel_size: 1
+ test_mode: false
+ timers: null
+ tp_comm_atomic_ag: false
+ tp_comm_atomic_rs: false
+ tp_comm_bootstrap_backend: nccl
+ tp_comm_bulk_dgrad: true
+ tp_comm_bulk_wgrad: true
+ tp_comm_overlap: false
+ tp_comm_overlap_ag: true
+ tp_comm_overlap_disable_fc1: false
+ tp_comm_overlap_disable_qkv: false
+ tp_comm_overlap_rs: true
+ tp_comm_overlap_rs_dgrad: false
+ tp_comm_split_ag: true
+ tp_comm_split_rs: true
+ tp_only_amax_red: false
+ transformer_layer_spec:
+ _call_: false
+ _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
+ use_cpu_initialization: false
+ use_ring_exchange_p2p: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ variable_seq_lengths: false
+ virtual_pipeline_model_parallel_size: null
+ wgrad_deferral_limit: 0
+ window_size: null
+ model_transform: null
+ optim:
+ _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
+ config:
+ _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ adam_eps: 1.0e-05
+ barrier_with_L1_time: false
+ bf16: true
+ clip_grad: 1.0
+ config_logger_dir: ''
+ decoupled_lr: null
+ decoupled_min_lr: null
+ exp_avg_dtype:
+ _call_: false
+ _target_: torch.float32
+ exp_avg_sq_dtype:
+ _call_: false
+ _target_: torch.float32
+ fp16: false
+ hysteresis: 2
+ initial_loss_scale: 4294967296
+ log_num_zeros_in_grad: false
+ loss_scale: null
+ loss_scale_window: 1000
+ lr: 0.0003
+ main_grads_dtype:
+ _call_: false
+ _target_: torch.float32
+ main_params_dtype:
+ _call_: false
+ _target_: torch.float32
+ min_loss_scale: 1.0
+ min_lr: null
+ optimizer: adam
+ overlap_param_gather_with_optimizer_step: false
+ params_dtype:
+ _call_: false
+ _target_: torch.float32
+ sgd_momentum: 0.9
+ timers: null
+ use_distributed_optimizer: true
+ use_precision_aware_optimizer: false
+ weight_decay: 0.1
+ lr_mult: 1.0
+ lr_scheduler:
+ _target_: nemo.lightning.pytorch.optim.lr_scheduler.CosineAnnealingScheduler
+ constant_steps: 0
+ frequency: 1
+ interval: step
+ max_steps: 10
+ min_lr: 2.9999999999999997e-05
+ monitor: val_loss
+ warmup_steps: 2000
+ no_weight_decay_cond: null
+ scale_lr_cond: null
+ tokenizer:
+ _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
+ additional_special_tokens: []
+ bos_token: null
+ cls_token: null
+ eos_token: null
+ include_special_tokens: false
+ mask_token: null
+ merges_file: null
+ pad_token: null
+ pretrained_model_name:
+ _target_: nemo.lightning.io.artifact.file.DirOrStringArtifact
+ attr: allenai/OLMo-1B-hf
+ required: true
+ skip: true
+ sep_token: null
+ trust_remote_code: false
+ unk_token: null
+ use_fast: true
+ vocab_file: tokenizer_config.json
model_name=0--step=699-consumed_samples=358400.0/context/tokenizer_config.json ADDED
@@ -0,0 +1,238 @@
+ {
+ "add_bos_token": false,
+ "add_eos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "|||IP_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "1": {
+ "content": "<|padding|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "50254": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50255": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50256": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50257": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50258": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50259": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50260": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50261": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50262": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50263": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50264": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50265": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50266": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50267": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50268": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50269": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50270": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50271": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50272": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50273": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50274": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50275": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50276": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50277": {
+ "content": "|||EMAIL_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50278": {
+ "content": "|||PHONE_NUMBER|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50279": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": null,
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<|padding|>",
+ "tokenizer_class": "GPTNeoXTokenizer",
+ "unk_token": null
+ }
model_name=0--step=799-consumed_samples=409600.0/context/3a9305c4-7453-4cad-a094-f9f6012b5392 ADDED
Binary file (202 Bytes).
model_name=0--step=799-consumed_samples=409600.0/context/8d43aadf-2f71-4c43-864a-951b87890162 ADDED
Binary file (584 Bytes).
model_name=0--step=799-consumed_samples=409600.0/context/fa28f4cf-dc94-495c-900b-a556df2fe4c0 ADDED
Binary file (173 Bytes).
model_name=0--step=799-consumed_samples=409600.0/context/io.json ADDED
@@ -0,0 +1 @@
+ {"root": {"type": "ref", "key": "trainer_context_1"}, "objects": {"tuple_1": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "sequence_parallel"], ["Index(index=4)", "context_parallel_size"], ["Index(index=5)", "expert_model_parallel_size"], ["Index(index=6)", "expert_tensor_parallel_size"], ["Index(index=7)", "moe_extended_tp"], ["Index(index=8)", "bf16"], ["Index(index=9)", "params_dtype"], ["Index(index=10)", "autocast_dtype"], ["Index(index=11)", "use_te_rng_tracker"], ["Index(index=12)", "pipeline_dtype"], ["Index(index=13)", "microbatch_group_size_per_vp_stage"], ["Index(index=14)", "account_for_embedding_in_pipeline_split"], ["Index(index=15)", "account_for_loss_in_pipeline_split"], ["Index(index=16)", "share_embeddings_and_output_weights"], ["Index(index=17)", "seq_length"]], "metadata": null}, "dict_1": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_2": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "Llama32Config1B"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_1"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_1"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_2"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama32_config1_b_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.virtual_pipeline_model_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.sequence_parallel"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.context_parallel_size"]}], ["Attr(name='expert_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.expert_model_parallel_size"]}], ["Attr(name='expert_tensor_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.expert_tensor_parallel_size"]}], ["Attr(name='moe_extended_tp')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.moe_extended_tp"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='autocast_dtype')", {"type": "pyref", 
"module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='use_te_rng_tracker')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.use_te_rng_tracker"]}], ["Attr(name='pipeline_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='microbatch_group_size_per_vp_stage')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.microbatch_group_size_per_vp_stage"]}], ["Attr(name='account_for_embedding_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_embedding_in_pipeline_split"]}], ["Attr(name='account_for_loss_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_loss_in_pipeline_split"]}], ["Attr(name='share_embeddings_and_output_weights')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.share_embeddings_and_output_weights"]}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.model.config.seq_length"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_1"}, "paths": ["<root>.model.config"]}, "tuple_2": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "optimizer"], ["Index(index=1)", "lr"], ["Index(index=2)", "min_lr"], ["Index(index=3)", "decoupled_lr"], ["Index(index=4)", "decoupled_min_lr"], ["Index(index=5)", "weight_decay"], ["Index(index=6)", "fp16"], ["Index(index=7)", "bf16"], ["Index(index=8)", "params_dtype"], ["Index(index=9)", "use_precision_aware_optimizer"], ["Index(index=10)", "main_grads_dtype"], ["Index(index=11)", "main_params_dtype"], ["Index(index=12)", "exp_avg_dtype"], ["Index(index=13)", "exp_avg_sq_dtype"], ["Index(index=14)", "loss_scale"], ["Index(index=15)", "initial_loss_scale"], ["Index(index=16)", "min_loss_scale"], ["Index(index=17)", "loss_scale_window"], ["Index(index=18)", "hysteresis"], ["Index(index=19)", "adam_beta1"], ["Index(index=20)", "adam_beta2"], ["Index(index=21)", "adam_eps"], ["Index(index=22)", "sgd_momentum"], ["Index(index=23)", "use_distributed_optimizer"], ["Index(index=24)", "overlap_param_gather_with_optimizer_step"], ["Index(index=25)", "clip_grad"], ["Index(index=26)", "log_num_zeros_in_grad"], ["Index(index=27)", "barrier_with_L1_time"], ["Index(index=28)", "timers"], ["Index(index=29)", "config_logger_dir"]], "metadata": null}, "dict_3": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_4": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.optimizer.optimizer_config", "name": "OptimizerConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_2"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_3"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_4"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": 
"BuildableTraverserMetadata"}}, "optimizer_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='optimizer')", {"type": "leaf", "value": "adam", "paths": ["<root>.model.optim.config.optimizer"]}], ["Attr(name='lr')", {"type": "leaf", "value": 0.0003, "paths": ["<root>.model.optim.config.lr"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.min_lr"]}], ["Attr(name='decoupled_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_lr"]}], ["Attr(name='decoupled_min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_min_lr"]}], ["Attr(name='weight_decay')", {"type": "leaf", "value": 0.1, "paths": ["<root>.model.optim.config.weight_decay"]}], ["Attr(name='fp16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.fp16"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='use_precision_aware_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_precision_aware_optimizer"]}], ["Attr(name='main_grads_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='main_params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_sq_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='loss_scale')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.loss_scale"]}], ["Attr(name='initial_loss_scale')", {"type": "leaf", "value": 4294967296, "paths": ["<root>.model.optim.config.initial_loss_scale"]}], ["Attr(name='min_loss_scale')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.min_loss_scale"]}], ["Attr(name='loss_scale_window')", {"type": "leaf", "value": 1000, "paths": ["<root>.model.optim.config.loss_scale_window"]}], ["Attr(name='hysteresis')", {"type": "leaf", "value": 2, "paths": ["<root>.model.optim.config.hysteresis"]}], ["Attr(name='adam_beta1')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.adam_beta1"]}], ["Attr(name='adam_beta2')", {"type": 
"leaf", "value": 0.95, "paths": ["<root>.model.optim.config.adam_beta2"]}], ["Attr(name='adam_eps')", {"type": "leaf", "value": 1e-05, "paths": ["<root>.model.optim.config.adam_eps"]}], ["Attr(name='sgd_momentum')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.sgd_momentum"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.use_distributed_optimizer"]}], ["Attr(name='overlap_param_gather_with_optimizer_step')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_param_gather_with_optimizer_step"]}], ["Attr(name='clip_grad')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.clip_grad"]}], ["Attr(name='log_num_zeros_in_grad')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.log_num_zeros_in_grad"]}], ["Attr(name='barrier_with_L1_time')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.barrier_with_L1_time"]}], ["Attr(name='timers')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.timers"]}], ["Attr(name='config_logger_dir')", {"type": "leaf", "value": "", "paths": ["<root>.model.optim.config.config_logger_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_2"}, "paths": ["<root>.model.optim.config"]}, "tuple_3": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "warmup_steps"], ["Index(index=1)", "constant_steps"], ["Index(index=2)", "min_lr"]], "metadata": null}, "dict_5": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_6": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_3": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.lr_scheduler", "name": "CosineAnnealingScheduler"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_3"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_5"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_6"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "cosine_annealing_scheduler_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='warmup_steps')", {"type": "leaf", "value": 2000, "paths": ["<root>.model.optim.lr_scheduler.warmup_steps"]}], ["Attr(name='constant_steps')", {"type": "leaf", "value": 0, "paths": ["<root>.model.optim.lr_scheduler.constant_steps"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": 2.9999999999999997e-05, "paths": ["<root>.model.optim.lr_scheduler.min_lr"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_3"}, "paths": ["<root>.model.optim.lr_scheduler"]}, "tuple_4": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "lr_scheduler"]], "metadata": null}, "dict_7": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_8": {"type": 
{"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_4": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.megatron", "name": "MegatronOptimizerModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_4"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_7"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_8"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_optimizer_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "optimizer_config_1"}], ["Attr(name='lr_scheduler')", {"type": "ref", "key": "cosine_annealing_scheduler_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_4"}, "paths": ["<root>.model.optim"]}, "tuple_5": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "attr"], ["Index(index=1)", "skip"]], "metadata": null}, "dict_9": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_10": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_5": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.artifact.file", "name": "DirOrStringArtifact"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_5"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_9"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_10"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "dir_or_string_artifact_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='attr')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.model.tokenizer.pretrained_model_name.attr"]}], ["Attr(name='skip')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.pretrained_model_name.skip"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_5"}, "paths": ["<root>.model.tokenizer.pretrained_model_name"]}, "tuple_6": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_11": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_12": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_6": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": 
[["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_6"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_11"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_12"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "ref", "key": "dir_or_string_artifact_1"}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "tokenizer_config.json", "paths": ["<root>.model.tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_6"}, "paths": ["<root>.model.tokenizer"]}, "tuple_7": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "optim"], ["Index(index=2)", "tokenizer"]], "metadata": null}, "dict_13": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_14": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_7": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "LlamaModel"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_7"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_13"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_14"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama_model_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "llama32_config1_b_1"}], ["Attr(name='optim')", {"type": "ref", "key": "megatron_optimizer_module_1"}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_7"}, "paths": ["<root>.model"]}, "tuple_8": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "grad_reduce_in_fp32"], ["Index(index=1)", "overlap_grad_reduce"], ["Index(index=2)", "overlap_param_gather"], ["Index(index=3)", "align_param_gather"], ["Index(index=4)", "use_distributed_optimizer"], ["Index(index=5)", "num_distributed_optimizer_instances"], ["Index(index=6)", "check_for_nan_in_grad"], ["Index(index=7)", "bucket_size"], ["Index(index=8)", "average_in_collective"], ["Index(index=9)", "fp8_param_gather"]], "metadata": null}, "dict_15": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_16": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "buildable_traverser_metadata_8": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.distributed.distributed_data_parallel_config", "name": "DistributedDataParallelConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_8"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_15"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_16"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "distributed_data_parallel_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='grad_reduce_in_fp32')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.grad_reduce_in_fp32"]}], ["Attr(name='overlap_grad_reduce')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_grad_reduce"]}], ["Attr(name='overlap_param_gather')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_param_gather"]}], ["Attr(name='align_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.align_param_gather"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.use_distributed_optimizer"]}], ["Attr(name='num_distributed_optimizer_instances')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.ddp.num_distributed_optimizer_instances"]}], ["Attr(name='check_for_nan_in_grad')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.check_for_nan_in_grad"]}], ["Attr(name='bucket_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.ddp.bucket_size"]}], ["Attr(name='average_in_collective')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.average_in_collective"]}], ["Attr(name='fp8_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.fp8_param_gather"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_8"}, "paths": ["<root>.trainer.strategy.ddp"]}, "tuple_9": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "gradient_as_bucket_view"]], "metadata": null}, "dict_17": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='gradient_as_bucket_view')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.kwargs['gradient_as_bucket_view']"]}]], "metadata": {"type": "ref", "key": "tuple_9"}, "paths": ["<root>.trainer.strategy.kwargs"]}, "tuple_10": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "context_parallel_size"], ["Index(index=4)", "sequence_parallel"], ["Index(index=5)", "ddp"], ["Index(index=6)", "pipeline_dtype"], ["Index(index=7)", "ckpt_async_save"], ["Index(index=8)", "ckpt_parallel_load"], ["Index(index=9)", "kwargs"]], "metadata": null}, "dict_18": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_19": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": 
"pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_9": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.strategies.megatron_strategy", "name": "MegatronStrategy"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_10"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_18"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_19"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_strategy_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.virtual_pipeline_model_parallel_size"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.context_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.sequence_parallel"]}], ["Attr(name='ddp')", {"type": "ref", "key": "distributed_data_parallel_config_1"}], ["Attr(name='pipeline_dtype')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.pipeline_dtype"]}], ["Attr(name='ckpt_async_save')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_async_save"]}], ["Attr(name='ckpt_parallel_load')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_parallel_load"]}], ["Attr(name='kwargs')", {"type": "ref", "key": "dict_17"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_9"}, "paths": ["<root>.trainer.strategy"]}, "timing_callback_1": {"type": {"type": "pyref", "module": "nemo.utils.exp_manager", "name": "TimingCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "3a9305c4-7453-4cad-a094-f9f6012b5392", "paths": ["<root>.trainer.callbacks[0]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[0]"]}, "garbage_collection_callback_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.callbacks.garbage_collection", "name": "GarbageCollectionCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "fa28f4cf-dc94-495c-900b-a556df2fe4c0", "paths": ["<root>.trainer.callbacks[1]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[1]"]}, "list_1": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "ref", "key": "timing_callback_1"}], ["Index(index=1)", {"type": "ref", "key": "garbage_collection_callback_1"}]], "metadata": null, "paths": ["<root>.trainer.callbacks"]}, "megatron_mixed_precision_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.plugins.mixed_precision", "name": "MegatronMixedPrecision"}, "items": [["IdentityElement()", {"type": "leaf", "value": "8d43aadf-2f71-4c43-864a-951b87890162", "paths": ["<root>.trainer.plugins"]}]], "metadata": null, "paths": ["<root>.trainer.plugins"]}, "tuple_11": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "accelerator"], 
["Index(index=1)", "strategy"], ["Index(index=2)", "devices"], ["Index(index=3)", "num_nodes"], ["Index(index=4)", "callbacks"], ["Index(index=5)", "max_steps"], ["Index(index=6)", "limit_val_batches"], ["Index(index=7)", "val_check_interval"], ["Index(index=8)", "log_every_n_steps"], ["Index(index=9)", "accumulate_grad_batches"], ["Index(index=10)", "use_distributed_sampler"], ["Index(index=11)", "plugins"]], "metadata": null}, "dict_20": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_21": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_10": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.trainer", "name": "Trainer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_11"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_20"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_21"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='accelerator')", {"type": "leaf", "value": "gpu", "paths": ["<root>.trainer.accelerator"]}], ["Attr(name='strategy')", {"type": "ref", "key": "megatron_strategy_1"}], ["Attr(name='devices')", {"type": "leaf", "value": 8, "paths": ["<root>.trainer.devices"]}], ["Attr(name='num_nodes')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.num_nodes"]}], ["Attr(name='callbacks')", {"type": "ref", "key": "list_1"}], ["Attr(name='max_steps')", {"type": "leaf", "value": 1168251, "paths": ["<root>.trainer.max_steps"]}], ["Attr(name='limit_val_batches')", {"type": "leaf", "value": 32, "paths": ["<root>.trainer.limit_val_batches"]}], ["Attr(name='val_check_interval')", {"type": "leaf", "value": 100, "paths": ["<root>.trainer.val_check_interval"]}], ["Attr(name='log_every_n_steps')", {"type": "leaf", "value": 10, "paths": ["<root>.trainer.log_every_n_steps"]}], ["Attr(name='accumulate_grad_batches')", {"type": "leaf", "value": 4, "paths": ["<root>.trainer.accumulate_grad_batches"]}], ["Attr(name='use_distributed_sampler')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.use_distributed_sampler"]}], ["Attr(name='plugins')", {"type": "ref", "key": "megatron_mixed_precision_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_10"}, "paths": ["<root>.trainer"]}, "list_2": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "leaf", "value": "Data/dclm_local_shard_1_megatron/concatenated.jsonl_text_document", "paths": ["<root>.extra['datamodule'].paths[0]"]}]], "metadata": null, "paths": ["<root>.extra['datamodule'].paths"]}, "tuple_12": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_22": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "dict_23": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_11": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_12"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_22"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_23"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.extra['datamodule'].tokenizer.pretrained_model_name"]}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "Data/tokenizer/tokenizer_config.json", "paths": ["<root>.extra['datamodule'].tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.extra['datamodule'].tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_11"}, "paths": ["<root>.extra['datamodule'].tokenizer"]}, "tuple_13": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "paths"], ["Index(index=1)", "seq_length"], ["Index(index=2)", "tokenizer"], ["Index(index=3)", "micro_batch_size"], ["Index(index=4)", "global_batch_size"], ["Index(index=5)", "split"], ["Index(index=6)", "index_mapping_dir"]], "metadata": null}, "dict_24": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_25": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_12": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.data.pre_training", "name": "PreTrainingDataModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_13"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_24"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_25"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "pre_training_data_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='paths')", {"type": "ref", "key": "list_2"}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.extra['datamodule'].seq_length"]}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_2"}], ["Attr(name='micro_batch_size')", {"type": "leaf", "value": 16, "paths": ["<root>.extra['datamodule'].micro_batch_size"]}], ["Attr(name='global_batch_size')", {"type": "leaf", "value": 512, "paths": ["<root>.extra['datamodule'].global_batch_size"]}], ["Attr(name='split')", {"type": "leaf", "value": "99,8,2", "paths": ["<root>.extra['datamodule'].split"]}], 
["Attr(name='index_mapping_dir')", {"type": "leaf", "value": "Data/index_mapping_local_shard_1", "paths": ["<root>.extra['datamodule'].index_mapping_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_12"}, "paths": ["<root>.extra['datamodule']"]}, "tuple_14": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "datamodule"]], "metadata": null}, "dict_26": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='datamodule')", {"type": "ref", "key": "pre_training_data_module_1"}]], "metadata": {"type": "ref", "key": "tuple_14"}, "paths": ["<root>.extra"]}, "tuple_15": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "model"], ["Index(index=1)", "trainer"], ["Index(index=2)", "extra"]], "metadata": null}, "dict_27": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_28": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_13": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.pl", "name": "TrainerContext"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_15"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_27"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_28"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_context_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='model')", {"type": "ref", "key": "llama_model_1"}], ["Attr(name='trainer')", {"type": "ref", "key": "trainer_1"}], ["Attr(name='extra')", {"type": "ref", "key": "dict_26"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_13"}, "paths": ["<root>"]}}, "refcounts": {"tuple_1": 1, "dict_1": 1, "dict_2": 1, "buildable_traverser_metadata_1": 1, "llama32_config1_b_1": 1, "tuple_2": 1, "dict_3": 1, "dict_4": 1, "buildable_traverser_metadata_2": 1, "optimizer_config_1": 1, "tuple_3": 1, "dict_5": 1, "dict_6": 1, "buildable_traverser_metadata_3": 1, "cosine_annealing_scheduler_1": 1, "tuple_4": 1, "dict_7": 1, "dict_8": 1, "buildable_traverser_metadata_4": 1, "megatron_optimizer_module_1": 1, "tuple_5": 1, "dict_9": 1, "dict_10": 1, "buildable_traverser_metadata_5": 1, "dir_or_string_artifact_1": 1, "tuple_6": 1, "dict_11": 1, "dict_12": 1, "buildable_traverser_metadata_6": 1, "auto_tokenizer_1": 1, "tuple_7": 1, "dict_13": 1, "dict_14": 1, "buildable_traverser_metadata_7": 1, "llama_model_1": 1, "tuple_8": 1, "dict_15": 1, "dict_16": 1, "buildable_traverser_metadata_8": 1, "distributed_data_parallel_config_1": 1, "tuple_9": 1, "dict_17": 1, "tuple_10": 1, "dict_18": 1, "dict_19": 1, "buildable_traverser_metadata_9": 1, "megatron_strategy_1": 1, "timing_callback_1": 1, "garbage_collection_callback_1": 1, "list_1": 1, "megatron_mixed_precision_1": 1, "tuple_11": 1, "dict_20": 1, "dict_21": 1, "buildable_traverser_metadata_10": 1, "trainer_1": 1, "list_2": 1, "tuple_12": 1, "dict_22": 1, "dict_23": 1, "buildable_traverser_metadata_11": 1, "auto_tokenizer_2": 1, "tuple_13": 1, 
"dict_24": 1, "dict_25": 1, "buildable_traverser_metadata_12": 1, "pre_training_data_module_1": 1, "tuple_14": 1, "dict_26": 1, "tuple_15": 1, "dict_27": 1, "dict_28": 1, "buildable_traverser_metadata_13": 1, "trainer_context_1": 1}, "version": "0.0.1"}
model_name=0--step=799-consumed_samples=409600.0/context/model.yaml ADDED
@@ -0,0 +1,266 @@
+ _target_: nemo.collections.llm.gpt.model.llama.LlamaModel
+ config:
+   _cpu_offloading_context: null
+   _target_: nemo.collections.llm.gpt.model.llama.Llama32Config1B
+   account_for_embedding_in_pipeline_split: false
+   account_for_loss_in_pipeline_split: false
+   activation_func:
+     _call_: false
+     _target_: torch.nn.functional.silu
+   activation_func_fp8_input_store: false
+   add_bias_linear: false
+   add_qkv_bias: false
+   apply_query_key_layer_scaling: false
+   apply_residual_connection_post_layernorm: false
+   apply_rope_fusion: true
+   async_tensor_model_parallel_allreduce: false
+   attention_backend:
+     _call_: true
+     _target_: megatron.core.transformer.enums.AttnBackend
+   attention_dropout: 0.0
+   attention_softmax_in_fp32: false
+   autocast_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   barrier_with_L1_time: true
+   batch_p2p_comm: true
+   batch_p2p_sync: true
+   bf16: true
+   bias_activation_fusion: true
+   bias_dropout_fusion: true
+   calculate_per_token_loss: false
+   clone_scatter_output_in_embedding: true
+   config_logger_dir: ''
+   context_parallel_size: 1
+   cp_comm_type: null
+   cpu_offloading: false
+   cpu_offloading_activations: true
+   cpu_offloading_num_layers: 0
+   cpu_offloading_weights: true
+   cross_entropy_loss_fusion: true
+   cuda_graph_retain_backward_graph: false
+   cuda_graph_use_single_mempool: false
+   cuda_graph_warmup_steps: 3
+   data_step_fn:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
+   deallocate_pipeline_outputs: true
+   defer_embedding_wgrad_compute: false
+   deterministic_mode: false
+   disable_parameter_transpose_cache: false
+   distribute_saved_activations: null
+   enable_autocast: false
+   enable_cuda_graph: false
+   expert_model_parallel_size: 1
+   expert_tensor_parallel_size: null
+   external_cuda_graph: false
+   ffn_hidden_size: 8192
+   finalize_model_grads_func: null
+   flash_decode: false
+   forward_step_fn:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
+   fp16: false
+   fp16_lm_cross_entropy: false
+   fp32_residual_connection: false
+   fp8: null
+   fp8_amax_compute_algo: most_recent
+   fp8_amax_history_len: 1
+   fp8_dot_product_attention: false
+   fp8_interval: 1
+   fp8_margin: 0
+   fp8_multi_head_attention: false
+   fp8_wgrad: true
+   gated_linear_unit: true
+   grad_scale_func: null
+   grad_sync_func: null
+   gradient_accumulation_fusion: true
+   hidden_dropout: 0.0
+   hidden_size: 2048
+   hierarchical_context_parallel_sizes: null
+   high_freq_factor: 4
+   inference_rng_tracker: false
+   init_method: null
+   init_method_std: 0.02
+   kv_channels: null
+   layernorm_epsilon: 1.0e-05
+   layernorm_zero_centered_gamma: false
+   low_freq_factor: 1
+   make_vocab_size_divisible_by: 128
+   masked_softmax_fusion: true
+   memory_efficient_layer_norm: false
+   microbatch_group_size_per_vp_stage: 1
+   moe_aux_loss_coeff: 0
+   moe_expert_capacity_factor: null
+   moe_extended_tp: false
+   moe_ffn_hidden_size: null
+   moe_grouped_gemm: false
+   moe_input_jitter_eps: null
+   moe_layer_freq: 1
+   moe_layer_recompute: false
+   moe_pad_expert_input_to_capacity: false
+   moe_per_layer_logging: false
+   moe_permute_fusion: false
+   moe_router_bias_update_rate: 0.001
+   moe_router_enable_expert_bias: false
+   moe_router_group_topk: null
+   moe_router_load_balancing_type: aux_loss
+   moe_router_num_groups: null
+   moe_router_pre_softmax: false
+   moe_router_score_function: softmax
+   moe_router_topk: 2
+   moe_router_topk_limited_devices: null
+   moe_router_topk_scaling_factor: null
+   moe_shared_expert_intermediate_size: null
+   moe_shared_expert_overlap: false
+   moe_token_dispatcher_type: allgather
+   moe_token_drop_policy: probs
+   moe_token_dropping: false
+   moe_use_legacy_grouped_gemm: false
+   moe_z_loss_coeff: null
+   multi_latent_attention: false
+   no_sync_func: null
+   normalization: RMSNorm
+   num_attention_heads: 32
+   num_layers: 16
+   num_layers_in_first_pipeline_stage: null
+   num_layers_in_last_pipeline_stage: null
+   num_microbatches_with_partial_activation_checkpoints: null
+   num_moe_experts: null
+   num_query_groups: 8
+   old_context_len: 8192
+   output_layer_init_method: null
+   overlap_p2p_comm: false
+   overlap_p2p_comm_warmup_flush: false
+   parallel_output: true
+   param_sync_func: null
+   params_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   perform_initialization: true
+   persist_layer_norm: true
+   pipeline_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   pipeline_model_parallel_size: 1
+   pipeline_model_parallel_split_rank: null
+   position_embedding_type: rope
+   qk_layernorm: false
+   recompute_granularity: null
+   recompute_method: null
+   recompute_num_layers: null
+   rotary_base: 500000
+   rotary_interleaved: false
+   rotary_percent: 1.0
+   scale_factor: 32
+   scatter_embedding_sequence_parallel: true
+   seq_len_interpolation_factor: null
+   seq_length: 2048
+   sequence_parallel: false
+   share_embeddings_and_output_weights: true
+   softmax_scale: null
+   tensor_model_parallel_size: 1
+   test_mode: false
+   timers: null
+   tp_comm_atomic_ag: false
+   tp_comm_atomic_rs: false
+   tp_comm_bootstrap_backend: nccl
+   tp_comm_bulk_dgrad: true
+   tp_comm_bulk_wgrad: true
+   tp_comm_overlap: false
+   tp_comm_overlap_ag: true
+   tp_comm_overlap_disable_fc1: false
+   tp_comm_overlap_disable_qkv: false
+   tp_comm_overlap_rs: true
+   tp_comm_overlap_rs_dgrad: false
+   tp_comm_split_ag: true
+   tp_comm_split_rs: true
+   tp_only_amax_red: false
+   transformer_layer_spec:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
+   use_cpu_initialization: false
+   use_ring_exchange_p2p: false
+   use_te_rng_tracker: false
+   use_transformer_engine_full_layer_spec: false
+   variable_seq_lengths: false
+   virtual_pipeline_model_parallel_size: null
+   wgrad_deferral_limit: 0
+   window_size: null
+ model_transform: null
+ optim:
+   _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
+   config:
+     _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-05
+     barrier_with_L1_time: false
+     bf16: true
+     clip_grad: 1.0
+     config_logger_dir: ''
+     decoupled_lr: null
+     decoupled_min_lr: null
+     exp_avg_dtype:
+       _call_: false
+       _target_: torch.float32
+     exp_avg_sq_dtype:
+       _call_: false
+       _target_: torch.float32
+     fp16: false
+     hysteresis: 2
+     initial_loss_scale: 4294967296
+     log_num_zeros_in_grad: false
+     loss_scale: null
+     loss_scale_window: 1000
+     lr: 0.0003
+     main_grads_dtype:
+       _call_: false
+       _target_: torch.float32
+     main_params_dtype:
+       _call_: false
+       _target_: torch.float32
+     min_loss_scale: 1.0
+     min_lr: null
+     optimizer: adam
+     overlap_param_gather_with_optimizer_step: false
+     params_dtype:
+       _call_: false
+       _target_: torch.float32
+     sgd_momentum: 0.9
+     timers: null
+     use_distributed_optimizer: true
+     use_precision_aware_optimizer: false
+     weight_decay: 0.1
+   lr_mult: 1.0
+   lr_scheduler:
+     _target_: nemo.lightning.pytorch.optim.lr_scheduler.CosineAnnealingScheduler
+     constant_steps: 0
+     frequency: 1
+     interval: step
+     max_steps: 10
+     min_lr: 2.9999999999999997e-05
+     monitor: val_loss
+     warmup_steps: 2000
+   no_weight_decay_cond: null
+   scale_lr_cond: null
+ tokenizer:
+   _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
+   additional_special_tokens: []
+   bos_token: null
+   cls_token: null
+   eos_token: null
+   include_special_tokens: false
+   mask_token: null
+   merges_file: null
+   pad_token: null
+   pretrained_model_name:
+     _target_: nemo.lightning.io.artifact.file.DirOrStringArtifact
+     attr: allenai/OLMo-1B-hf
+     required: true
+     skip: true
+   sep_token: null
+   trust_remote_code: false
+   unk_token: null
+   use_fast: true
+   vocab_file: tokenizer_config.json
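The `model.yaml` above is a plain YAML dump of the model context, so the recorded hyperparameters can be read back with PyYAML. A minimal sketch, assuming the file has been fetched locally (path and key names taken from the dump above):

```python
import yaml

# Minimal sketch (assumption: model.yaml downloaded locally from this checkpoint's context/).
with open("model.yaml") as f:
    model_cfg = yaml.safe_load(f)

cfg = model_cfg["config"]
print(model_cfg["_target_"])                  # nemo.collections.llm.gpt.model.llama.LlamaModel
print(cfg["hidden_size"], cfg["num_layers"])  # 2048, 16
print(cfg["num_attention_heads"], cfg["num_query_groups"])  # 32, 8
print(model_cfg["optim"]["config"]["lr"])     # 0.0003
```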
model_name=0--step=799-consumed_samples=409600.0/context/tokenizer_config.json ADDED
@@ -0,0 +1,238 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "|||IP_ADDRESS|||",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "1": {
+       "content": "<|padding|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50254": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50255": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50256": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50257": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50258": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50259": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50260": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50261": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50262": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50263": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50264": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50265": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50266": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50267": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50268": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50269": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50270": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50271": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50272": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50273": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50274": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50275": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50276": {
+       "content": " ",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50277": {
+       "content": "|||EMAIL_ADDRESS|||",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50278": {
+       "content": "|||PHONE_NUMBER|||",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "50279": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": null,
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<|padding|>",
+   "tokenizer_class": "GPTNeoXTokenizer",
+   "unk_token": null
+ }
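The tokenizer context points at `allenai/OLMo-1B-hf` with a GPT-NeoX-style tokenizer, so an equivalent tokenizer can be instantiated straight from the Hub with `transformers`. A minimal sketch (not part of this repository's tooling):

```python
from transformers import AutoTokenizer

# Minimal sketch: load the same tokenizer the checkpoint context refers to.
tok = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")
print(tok.eos_token, tok.pad_token)   # <|endoftext|>, <|padding|> per the config above
print(tok("Hello world")["input_ids"])
```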
model_name=0--step=799-consumed_samples=409600.0/weights/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cd170e22dcdc0e4dae4c45ff81c24859659ca5ab0915d9fd06476242ea52f02
+ size 272079
model_name=0--step=799-consumed_samples=409600.0/weights/__3_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b0836e178ac26012a9bc800de155998308a9ac23b20df194e1dc86781ea4541
+ size 943954152
model_name=0--step=799-consumed_samples=409600.0/weights/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dbe27f16a4fd3f793a5b38a2b9fe5837ee19725dc9638d9bbc832c3f26cb0d18
+ size 938826468
model_name=0--step=799-consumed_samples=409600.0/weights/common.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1507135bcb64e40b5d7df4f67ab2dd3bdcefa92daf80b320eadba071dca160e8
+ size 7919
model_name=0--step=799-consumed_samples=409600.0/weights/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
model_name=0--step=899-consumed_samples=460800.0/weights/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cd6a46a8766b9ac4227157bbe9932ff6ac2a71899b1d8a558c2b8c86a6e531c
+ size 938897288
model_name=0--step=899-consumed_samples=460800.0/weights/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9880badd62e5f7a2f9ea861d47e92fb574249cafb03d77d0544a06b065b75ac
+ size 940460984
model_name=0--step=899-consumed_samples=460800.0/weights/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26305970ccf73ab2a2df00769cf902178dc3b31fd20b03aae8937dd5dbfe00da
+ size 938851248
model_name=0--step=899-consumed_samples=460800.0/weights/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9651d190e1baeb7b31b9ef223751e75f6fc468755328059437b0609287e7e28e
+ size 943962344
model_name=0--step=899-consumed_samples=460800.0/weights/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fed6fcf3fc5a9a8138b2bc5904d3ab5ea85552518b7e19a9c8925c92e40002c7
+ size 937781988
model_name=0--step=899-consumed_samples=460800.0/weights/__2_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c52dfc09030c7cd3aa32d21419911657f2a5bed9aa46bde849e5e911d026e544
+ size 944982044
model_name=0--step=899-consumed_samples=460800.0/weights/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bcbe3d9e69dc77393adacabd5be941f5f23464edeb4f160002ac292e9613b733
+ size 943054924
model_name=0--step=899-consumed_samples=460800.0/weights/__3_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e626b9081f284dd20394372a8d975e7936e262093db74975ff69b3e55f19238
+ size 943954152
model_name=0--step=899-consumed_samples=460800.0/weights/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5514bf70fe29ab9ce61fbb9174817756c67f92fdbfb377d34ed63b8bf82f821b
+ size 941969992
model_name=0--step=899-consumed_samples=460800.0/weights/__4_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69558b755c58f6f200edcd006cfb8bceef13b27bd94b5c5be1378cacc3c569b5
+ size 944985984
model_name=0--step=899-consumed_samples=460800.0/weights/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a549eb0b75e2dd4a6228cc0216aa8abda3c498c9b8a06bda95e2b4e7af083ad
+ size 941995240
model_name=0--step=899-consumed_samples=460800.0/weights/__5_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:23bd28dd9793edcb367b4824921619b01b4f2a4590d4e976db18331c78c9fe1d
+ size 945017376
model_name=0--step=899-consumed_samples=460800.0/weights/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:049e48f31147d3010fc42fe28488d3dfaa30d1bfb1b4776b09fa26f8499805b3
+ size 941969992
model_name=0--step=899-consumed_samples=460800.0/weights/__6_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a5c3747975f2b0afe57c27f6b11fb6b0b718d79a1c6deb4a1f8471f977ce93b
+ size 936770304
model_name=0--step=899-consumed_samples=460800.0/weights/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bb0a9b6b93f31a8a6a33e11aca04fda52b278f5a0b1af5774f0741b6f12b2dd
+ size 938826468
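The `.distcp` weight shards above are committed as Git LFS pointer files: each records the LFS spec version, a `sha256` object id, and the payload size, while the actual tensors live in LFS storage. A minimal sketch for reading those fields from a locally checked-out pointer (the path is illustrative):

```python
# Minimal sketch: parse a Git LFS pointer file without the git-lfs client.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Illustrative local path to one of the shards listed above.
ptr = read_lfs_pointer("weights/__7_0.distcp")
print(ptr["oid"], int(ptr["size"]))  # e.g. sha256:<hex digest>, shard size in bytes
```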