Commit 0143de1 (verified), committed by aflah · Parent(s): 50732c4

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.

Files changed (50)
  1. .gitattributes +37 -0
  2. model_name=0--step=1274-consumed_samples=652800.0-last/weights/.metadata +3 -0
  3. model_name=0--step=1274-consumed_samples=652800.0-last/weights/__2_0.distcp +3 -0
  4. model_name=0--step=1274-consumed_samples=652800.0-last/weights/__2_1.distcp +3 -0
  5. model_name=0--step=1274-consumed_samples=652800.0-last/weights/__5_0.distcp +3 -0
  6. model_name=0--step=1274-consumed_samples=652800.0-last/weights/__6_1.distcp +3 -0
  7. model_name=0--step=1274-consumed_samples=652800.0-last/weights/__7_0.distcp +3 -0
  8. model_name=0--step=1274-consumed_samples=652800.0-last/weights/metadata.json +1 -0
  9. model_name=0--step=1299-consumed_samples=665600.0-last/context/2049c765-98e2-41f3-a848-87d6ff71200d +0 -0
  10. model_name=0--step=1299-consumed_samples=665600.0-last/context/22af8890-6558-45e2-a197-992aeb69982c +0 -0
  11. model_name=0--step=1299-consumed_samples=665600.0-last/context/aa186669-4cbe-4e9a-b800-a88c74e5c229 +0 -0
  12. model_name=0--step=1299-consumed_samples=665600.0-last/context/io.json +1 -0
  13. model_name=0--step=1299-consumed_samples=665600.0-last/context/model.yaml +266 -0
  14. model_name=0--step=1299-consumed_samples=665600.0-last/context/tokenizer_config.json +238 -0
  15. model_name=0--step=1299-consumed_samples=665600.0-last/weights/__1_0.distcp +3 -0
  16. model_name=0--step=1299-consumed_samples=665600.0-last/weights/__3_1.distcp +3 -0
  17. model_name=0--step=1299-consumed_samples=665600.0-last/weights/__4_1.distcp +3 -0
  18. model_name=0--step=1299-consumed_samples=665600.0-last/weights/__7_0.distcp +3 -0
  19. model_name=0--step=299-consumed_samples=153600.0/context/1c2c7859-77a9-4039-bffd-0ebd68c1fc94 +0 -0
  20. model_name=0--step=299-consumed_samples=153600.0/context/684b8564-68fa-41fa-9cdc-bbdf4eba2314 +0 -0
  21. model_name=0--step=299-consumed_samples=153600.0/context/8ceb9fce-96b9-44bd-8e0f-96e8cbc13ea4 +0 -0
  22. model_name=0--step=299-consumed_samples=153600.0/context/io.json +1 -0
  23. model_name=0--step=299-consumed_samples=153600.0/context/model.yaml +266 -0
  24. model_name=0--step=299-consumed_samples=153600.0/context/tokenizer_config.json +238 -0
  25. model_name=0--step=399-consumed_samples=204800.0/weights/__0_0.distcp +3 -0
  26. model_name=0--step=399-consumed_samples=204800.0/weights/__0_1.distcp +3 -0
  27. model_name=0--step=399-consumed_samples=204800.0/weights/__1_0.distcp +3 -0
  28. model_name=0--step=399-consumed_samples=204800.0/weights/__2_0.distcp +3 -0
  29. model_name=0--step=399-consumed_samples=204800.0/weights/__2_1.distcp +3 -0
  30. model_name=0--step=399-consumed_samples=204800.0/weights/__3_1.distcp +3 -0
  31. model_name=0--step=399-consumed_samples=204800.0/weights/__4_1.distcp +3 -0
  32. model_name=0--step=399-consumed_samples=204800.0/weights/__5_0.distcp +3 -0
  33. model_name=0--step=399-consumed_samples=204800.0/weights/__5_1.distcp +3 -0
  34. model_name=0--step=399-consumed_samples=204800.0/weights/__6_0.distcp +3 -0
  35. model_name=0--step=399-consumed_samples=204800.0/weights/__7_0.distcp +3 -0
  36. model_name=0--step=499-consumed_samples=256000.0/weights/__3_0.distcp +3 -0
  37. model_name=0--step=499-consumed_samples=256000.0/weights/__5_0.distcp +3 -0
  38. model_name=0--step=499-consumed_samples=256000.0/weights/__7_0.distcp +3 -0
  39. model_name=0--step=599-consumed_samples=307200.0/weights/__5_0.distcp +3 -0
  40. model_name=0--step=599-consumed_samples=307200.0/weights/__7_1.distcp +3 -0
  41. model_name=0--step=699-consumed_samples=358400.0/weights/__7_1.distcp +3 -0
  42. model_name=0--step=799-consumed_samples=409600.0/weights/__0_1.distcp +3 -0
  43. model_name=0--step=799-consumed_samples=409600.0/weights/__3_0.distcp +3 -0
  44. model_name=0--step=799-consumed_samples=409600.0/weights/__4_0.distcp +3 -0
  45. model_name=0--step=799-consumed_samples=409600.0/weights/__6_0.distcp +3 -0
  46. model_name=0--step=799-consumed_samples=409600.0/weights/__7_1.distcp +3 -0
  47. model_name=0--step=999-consumed_samples=512000.0/weights/__0_1.distcp +3 -0
  48. model_name=0--step=999-consumed_samples=512000.0/weights/__1_1.distcp +3 -0
  49. model_name=0--step=999-consumed_samples=512000.0/weights/__2_0.distcp +3 -0
  50. model_name=0--step=999-consumed_samples=512000.0/weights/__5_0.distcp +3 -0
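
The directory names in this listing encode training progress: with the global batch size of 512 recorded in the datamodule config further down (io.json), `consumed_samples` appears to equal `(step + 1) × 512` for every checkpoint in this commit. A quick sanity check of that relation (pure Python; step/consumed_samples pairs copied from the listing above):

```python
# Checkpoint directory names from the file listing: step -> consumed_samples.
checkpoints = {
    299: 153_600.0,
    399: 204_800.0,
    499: 256_000.0,
    599: 307_200.0,
    699: 358_400.0,
    799: 409_600.0,
    999: 512_000.0,
    1274: 652_800.0,
    1299: 665_600.0,
}

GLOBAL_BATCH_SIZE = 512  # from <root>.extra['datamodule'].global_batch_size in io.json below

for step, consumed in checkpoints.items():
    assert consumed == (step + 1) * GLOBAL_BATCH_SIZE, (step, consumed)
print("consumed_samples == (step + 1) * 512 for all listed checkpoints")
```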
.gitattributes CHANGED
@@ -161,3 +161,40 @@ model_name=0--step=999-consumed_samples=512000.0/weights/__7_1.distcp filter=lfs
  model_name=0--step=999-consumed_samples=512000.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
  model_name=0--step=799-consumed_samples=409600.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
  model_name=0--step=799-consumed_samples=409600.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=999-consumed_samples=512000.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1274-consumed_samples=652800.0-last/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=599-consumed_samples=307200.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=499-consumed_samples=256000.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=499-consumed_samples=256000.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=499-consumed_samples=256000.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=599-consumed_samples=307200.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=399-consumed_samples=204800.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1274-consumed_samples=652800.0-last/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1274-consumed_samples=652800.0-last/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0-last/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0-last/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=699-consumed_samples=358400.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0-last/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1299-consumed_samples=665600.0-last/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1274-consumed_samples=652800.0-last/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1274-consumed_samples=652800.0-last/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=1274-consumed_samples=652800.0-last/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+ model_name=0--step=799-consumed_samples=409600.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
model_name=0--step=1274-consumed_samples=652800.0-last/weights/.metadata ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fb2498d72af3d35105d036a4e17e9f5861f8686709c149842edb7e64260136c
+ size 272085
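
Every large file in this commit is stored as a Git LFS pointer like the three-line stub above: the spec version, the SHA-256 of the actual object, and its size in bytes; the real payload is fetched on checkout by the LFS filter. A minimal, dependency-free sketch for reading such a pointer locally (the path is just one example from this commit and assumes the pointer text, not the resolved binary, is on disk):

```python
from pathlib import Path


def parse_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file into its version / oid / size fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }


# Hypothetical local path; after `git lfs checkout` this file would contain
# the full ~272 KB .metadata payload instead of the pointer text.
print(parse_lfs_pointer(
    "model_name=0--step=1274-consumed_samples=652800.0-last/weights/.metadata"
))
```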
model_name=0--step=1274-consumed_samples=652800.0-last/weights/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a7c23488bc9f9db8044a9695d11046963da3eb917fa7ac1540edf7c30a6fb3b
+ size 937781988
model_name=0--step=1274-consumed_samples=652800.0-last/weights/__2_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60953b7e518dac93983c266caf4a4fd5c5a44884d2c0fcb110cf039afd3a94f8
+ size 944982044
model_name=0--step=1274-consumed_samples=652800.0-last/weights/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76a8a40858beac52db43d443b055b825df4384ac52c57ab9149f5b6f606ddc2f
+ size 941995240
model_name=0--step=1274-consumed_samples=652800.0-last/weights/__6_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:862a41260f0b0780d8390ec81df037761edb5e83c474c8f762b837f62fa765bb
+ size 936770304
model_name=0--step=1274-consumed_samples=652800.0-last/weights/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da040d7532aeac5567b95d613590b41547be820cd8d74e1145335a324cdff7fe
+ size 938826468
model_name=0--step=1274-consumed_samples=652800.0-last/weights/metadata.json ADDED
@@ -0,0 +1 @@
+ {"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
model_name=0--step=1299-consumed_samples=665600.0-last/context/2049c765-98e2-41f3-a848-87d6ff71200d ADDED
Binary file (173 Bytes).
 
model_name=0--step=1299-consumed_samples=665600.0-last/context/22af8890-6558-45e2-a197-992aeb69982c ADDED
Binary file (584 Bytes).
 
model_name=0--step=1299-consumed_samples=665600.0-last/context/aa186669-4cbe-4e9a-b800-a88c74e5c229 ADDED
Binary file (202 Bytes).
 
model_name=0--step=1299-consumed_samples=665600.0-last/context/io.json ADDED
@@ -0,0 +1 @@
+ {"root": {"type": "ref", "key": "trainer_context_1"}, "objects": {"tuple_1": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "sequence_parallel"], ["Index(index=4)", "context_parallel_size"], ["Index(index=5)", "expert_model_parallel_size"], ["Index(index=6)", "expert_tensor_parallel_size"], ["Index(index=7)", "moe_extended_tp"], ["Index(index=8)", "bf16"], ["Index(index=9)", "params_dtype"], ["Index(index=10)", "autocast_dtype"], ["Index(index=11)", "use_te_rng_tracker"], ["Index(index=12)", "pipeline_dtype"], ["Index(index=13)", "microbatch_group_size_per_vp_stage"], ["Index(index=14)", "account_for_embedding_in_pipeline_split"], ["Index(index=15)", "account_for_loss_in_pipeline_split"], ["Index(index=16)", "share_embeddings_and_output_weights"], ["Index(index=17)", "seq_length"]], "metadata": null}, "dict_1": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_2": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "Llama32Config1B"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_1"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_1"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_2"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama32_config1_b_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.virtual_pipeline_model_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.sequence_parallel"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.context_parallel_size"]}], ["Attr(name='expert_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.expert_model_parallel_size"]}], ["Attr(name='expert_tensor_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.expert_tensor_parallel_size"]}], ["Attr(name='moe_extended_tp')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.moe_extended_tp"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='autocast_dtype')", {"type": "pyref", 
"module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='use_te_rng_tracker')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.use_te_rng_tracker"]}], ["Attr(name='pipeline_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='microbatch_group_size_per_vp_stage')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.microbatch_group_size_per_vp_stage"]}], ["Attr(name='account_for_embedding_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_embedding_in_pipeline_split"]}], ["Attr(name='account_for_loss_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_loss_in_pipeline_split"]}], ["Attr(name='share_embeddings_and_output_weights')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.share_embeddings_and_output_weights"]}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.model.config.seq_length"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_1"}, "paths": ["<root>.model.config"]}, "tuple_2": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "optimizer"], ["Index(index=1)", "lr"], ["Index(index=2)", "min_lr"], ["Index(index=3)", "decoupled_lr"], ["Index(index=4)", "decoupled_min_lr"], ["Index(index=5)", "weight_decay"], ["Index(index=6)", "fp16"], ["Index(index=7)", "bf16"], ["Index(index=8)", "params_dtype"], ["Index(index=9)", "use_precision_aware_optimizer"], ["Index(index=10)", "main_grads_dtype"], ["Index(index=11)", "main_params_dtype"], ["Index(index=12)", "exp_avg_dtype"], ["Index(index=13)", "exp_avg_sq_dtype"], ["Index(index=14)", "loss_scale"], ["Index(index=15)", "initial_loss_scale"], ["Index(index=16)", "min_loss_scale"], ["Index(index=17)", "loss_scale_window"], ["Index(index=18)", "hysteresis"], ["Index(index=19)", "adam_beta1"], ["Index(index=20)", "adam_beta2"], ["Index(index=21)", "adam_eps"], ["Index(index=22)", "sgd_momentum"], ["Index(index=23)", "use_distributed_optimizer"], ["Index(index=24)", "overlap_param_gather_with_optimizer_step"], ["Index(index=25)", "clip_grad"], ["Index(index=26)", "log_num_zeros_in_grad"], ["Index(index=27)", "barrier_with_L1_time"], ["Index(index=28)", "timers"], ["Index(index=29)", "config_logger_dir"]], "metadata": null}, "dict_3": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_4": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.optimizer.optimizer_config", "name": "OptimizerConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_2"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_3"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_4"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": 
"BuildableTraverserMetadata"}}, "optimizer_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='optimizer')", {"type": "leaf", "value": "adam", "paths": ["<root>.model.optim.config.optimizer"]}], ["Attr(name='lr')", {"type": "leaf", "value": 0.0003, "paths": ["<root>.model.optim.config.lr"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.min_lr"]}], ["Attr(name='decoupled_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_lr"]}], ["Attr(name='decoupled_min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_min_lr"]}], ["Attr(name='weight_decay')", {"type": "leaf", "value": 0.1, "paths": ["<root>.model.optim.config.weight_decay"]}], ["Attr(name='fp16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.fp16"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='use_precision_aware_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_precision_aware_optimizer"]}], ["Attr(name='main_grads_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='main_params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_sq_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='loss_scale')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.loss_scale"]}], ["Attr(name='initial_loss_scale')", {"type": "leaf", "value": 4294967296, "paths": ["<root>.model.optim.config.initial_loss_scale"]}], ["Attr(name='min_loss_scale')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.min_loss_scale"]}], ["Attr(name='loss_scale_window')", {"type": "leaf", "value": 1000, "paths": ["<root>.model.optim.config.loss_scale_window"]}], ["Attr(name='hysteresis')", {"type": "leaf", "value": 2, "paths": ["<root>.model.optim.config.hysteresis"]}], ["Attr(name='adam_beta1')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.adam_beta1"]}], ["Attr(name='adam_beta2')", {"type": 
"leaf", "value": 0.95, "paths": ["<root>.model.optim.config.adam_beta2"]}], ["Attr(name='adam_eps')", {"type": "leaf", "value": 1e-05, "paths": ["<root>.model.optim.config.adam_eps"]}], ["Attr(name='sgd_momentum')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.sgd_momentum"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.use_distributed_optimizer"]}], ["Attr(name='overlap_param_gather_with_optimizer_step')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_param_gather_with_optimizer_step"]}], ["Attr(name='clip_grad')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.clip_grad"]}], ["Attr(name='log_num_zeros_in_grad')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.log_num_zeros_in_grad"]}], ["Attr(name='barrier_with_L1_time')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.barrier_with_L1_time"]}], ["Attr(name='timers')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.timers"]}], ["Attr(name='config_logger_dir')", {"type": "leaf", "value": "", "paths": ["<root>.model.optim.config.config_logger_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_2"}, "paths": ["<root>.model.optim.config"]}, "tuple_3": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "warmup_steps"], ["Index(index=1)", "constant_steps"], ["Index(index=2)", "min_lr"]], "metadata": null}, "dict_5": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_6": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_3": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.lr_scheduler", "name": "CosineAnnealingScheduler"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_3"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_5"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_6"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "cosine_annealing_scheduler_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='warmup_steps')", {"type": "leaf", "value": 2000, "paths": ["<root>.model.optim.lr_scheduler.warmup_steps"]}], ["Attr(name='constant_steps')", {"type": "leaf", "value": 0, "paths": ["<root>.model.optim.lr_scheduler.constant_steps"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": 2.9999999999999997e-05, "paths": ["<root>.model.optim.lr_scheduler.min_lr"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_3"}, "paths": ["<root>.model.optim.lr_scheduler"]}, "tuple_4": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "lr_scheduler"]], "metadata": null}, "dict_7": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_8": {"type": 
{"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_4": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.megatron", "name": "MegatronOptimizerModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_4"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_7"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_8"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_optimizer_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "optimizer_config_1"}], ["Attr(name='lr_scheduler')", {"type": "ref", "key": "cosine_annealing_scheduler_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_4"}, "paths": ["<root>.model.optim"]}, "tuple_5": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "attr"], ["Index(index=1)", "skip"]], "metadata": null}, "dict_9": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_10": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_5": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.artifact.file", "name": "DirOrStringArtifact"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_5"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_9"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_10"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "dir_or_string_artifact_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='attr')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.model.tokenizer.pretrained_model_name.attr"]}], ["Attr(name='skip')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.pretrained_model_name.skip"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_5"}, "paths": ["<root>.model.tokenizer.pretrained_model_name"]}, "tuple_6": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_11": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_12": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_6": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": 
[["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_6"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_11"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_12"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "ref", "key": "dir_or_string_artifact_1"}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "tokenizer_config.json", "paths": ["<root>.model.tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_6"}, "paths": ["<root>.model.tokenizer"]}, "tuple_7": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "optim"], ["Index(index=2)", "tokenizer"]], "metadata": null}, "dict_13": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_14": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_7": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "LlamaModel"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_7"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_13"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_14"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama_model_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "llama32_config1_b_1"}], ["Attr(name='optim')", {"type": "ref", "key": "megatron_optimizer_module_1"}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_7"}, "paths": ["<root>.model"]}, "tuple_8": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "grad_reduce_in_fp32"], ["Index(index=1)", "overlap_grad_reduce"], ["Index(index=2)", "overlap_param_gather"], ["Index(index=3)", "align_param_gather"], ["Index(index=4)", "use_distributed_optimizer"], ["Index(index=5)", "num_distributed_optimizer_instances"], ["Index(index=6)", "check_for_nan_in_grad"], ["Index(index=7)", "bucket_size"], ["Index(index=8)", "average_in_collective"], ["Index(index=9)", "fp8_param_gather"]], "metadata": null}, "dict_15": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_16": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "buildable_traverser_metadata_8": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.distributed.distributed_data_parallel_config", "name": "DistributedDataParallelConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_8"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_15"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_16"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "distributed_data_parallel_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='grad_reduce_in_fp32')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.grad_reduce_in_fp32"]}], ["Attr(name='overlap_grad_reduce')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_grad_reduce"]}], ["Attr(name='overlap_param_gather')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_param_gather"]}], ["Attr(name='align_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.align_param_gather"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.use_distributed_optimizer"]}], ["Attr(name='num_distributed_optimizer_instances')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.ddp.num_distributed_optimizer_instances"]}], ["Attr(name='check_for_nan_in_grad')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.check_for_nan_in_grad"]}], ["Attr(name='bucket_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.ddp.bucket_size"]}], ["Attr(name='average_in_collective')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.average_in_collective"]}], ["Attr(name='fp8_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.fp8_param_gather"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_8"}, "paths": ["<root>.trainer.strategy.ddp"]}, "tuple_9": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "gradient_as_bucket_view"]], "metadata": null}, "dict_17": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='gradient_as_bucket_view')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.kwargs['gradient_as_bucket_view']"]}]], "metadata": {"type": "ref", "key": "tuple_9"}, "paths": ["<root>.trainer.strategy.kwargs"]}, "tuple_10": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "context_parallel_size"], ["Index(index=4)", "sequence_parallel"], ["Index(index=5)", "ddp"], ["Index(index=6)", "pipeline_dtype"], ["Index(index=7)", "ckpt_async_save"], ["Index(index=8)", "ckpt_parallel_load"], ["Index(index=9)", "kwargs"]], "metadata": null}, "dict_18": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_19": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": 
"pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_9": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.strategies.megatron_strategy", "name": "MegatronStrategy"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_10"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_18"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_19"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_strategy_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.virtual_pipeline_model_parallel_size"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.context_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.sequence_parallel"]}], ["Attr(name='ddp')", {"type": "ref", "key": "distributed_data_parallel_config_1"}], ["Attr(name='pipeline_dtype')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.pipeline_dtype"]}], ["Attr(name='ckpt_async_save')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_async_save"]}], ["Attr(name='ckpt_parallel_load')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_parallel_load"]}], ["Attr(name='kwargs')", {"type": "ref", "key": "dict_17"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_9"}, "paths": ["<root>.trainer.strategy"]}, "timing_callback_1": {"type": {"type": "pyref", "module": "nemo.utils.exp_manager", "name": "TimingCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "aa186669-4cbe-4e9a-b800-a88c74e5c229", "paths": ["<root>.trainer.callbacks[0]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[0]"]}, "garbage_collection_callback_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.callbacks.garbage_collection", "name": "GarbageCollectionCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "2049c765-98e2-41f3-a848-87d6ff71200d", "paths": ["<root>.trainer.callbacks[1]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[1]"]}, "list_1": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "ref", "key": "timing_callback_1"}], ["Index(index=1)", {"type": "ref", "key": "garbage_collection_callback_1"}]], "metadata": null, "paths": ["<root>.trainer.callbacks"]}, "megatron_mixed_precision_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.plugins.mixed_precision", "name": "MegatronMixedPrecision"}, "items": [["IdentityElement()", {"type": "leaf", "value": "22af8890-6558-45e2-a197-992aeb69982c", "paths": ["<root>.trainer.plugins"]}]], "metadata": null, "paths": ["<root>.trainer.plugins"]}, "tuple_11": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "accelerator"], 
["Index(index=1)", "strategy"], ["Index(index=2)", "devices"], ["Index(index=3)", "num_nodes"], ["Index(index=4)", "callbacks"], ["Index(index=5)", "max_steps"], ["Index(index=6)", "limit_val_batches"], ["Index(index=7)", "val_check_interval"], ["Index(index=8)", "log_every_n_steps"], ["Index(index=9)", "accumulate_grad_batches"], ["Index(index=10)", "use_distributed_sampler"], ["Index(index=11)", "plugins"]], "metadata": null}, "dict_20": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_21": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_10": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.trainer", "name": "Trainer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_11"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_20"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_21"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='accelerator')", {"type": "leaf", "value": "gpu", "paths": ["<root>.trainer.accelerator"]}], ["Attr(name='strategy')", {"type": "ref", "key": "megatron_strategy_1"}], ["Attr(name='devices')", {"type": "leaf", "value": 8, "paths": ["<root>.trainer.devices"]}], ["Attr(name='num_nodes')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.num_nodes"]}], ["Attr(name='callbacks')", {"type": "ref", "key": "list_1"}], ["Attr(name='max_steps')", {"type": "leaf", "value": 1168251, "paths": ["<root>.trainer.max_steps"]}], ["Attr(name='limit_val_batches')", {"type": "leaf", "value": 32, "paths": ["<root>.trainer.limit_val_batches"]}], ["Attr(name='val_check_interval')", {"type": "leaf", "value": 100, "paths": ["<root>.trainer.val_check_interval"]}], ["Attr(name='log_every_n_steps')", {"type": "leaf", "value": 10, "paths": ["<root>.trainer.log_every_n_steps"]}], ["Attr(name='accumulate_grad_batches')", {"type": "leaf", "value": 4, "paths": ["<root>.trainer.accumulate_grad_batches"]}], ["Attr(name='use_distributed_sampler')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.use_distributed_sampler"]}], ["Attr(name='plugins')", {"type": "ref", "key": "megatron_mixed_precision_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_10"}, "paths": ["<root>.trainer"]}, "list_2": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "leaf", "value": "Data/dclm_local_shard_1_megatron/concatenated.jsonl_text_document", "paths": ["<root>.extra['datamodule'].paths[0]"]}]], "metadata": null, "paths": ["<root>.extra['datamodule'].paths"]}, "tuple_12": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_22": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "dict_23": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_11": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_12"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_22"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_23"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.extra['datamodule'].tokenizer.pretrained_model_name"]}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "Data/tokenizer/tokenizer_config.json", "paths": ["<root>.extra['datamodule'].tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.extra['datamodule'].tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_11"}, "paths": ["<root>.extra['datamodule'].tokenizer"]}, "tuple_13": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "paths"], ["Index(index=1)", "seq_length"], ["Index(index=2)", "tokenizer"], ["Index(index=3)", "micro_batch_size"], ["Index(index=4)", "global_batch_size"], ["Index(index=5)", "split"], ["Index(index=6)", "index_mapping_dir"]], "metadata": null}, "dict_24": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_25": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_12": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.data.pre_training", "name": "PreTrainingDataModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_13"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_24"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_25"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "pre_training_data_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='paths')", {"type": "ref", "key": "list_2"}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.extra['datamodule'].seq_length"]}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_2"}], ["Attr(name='micro_batch_size')", {"type": "leaf", "value": 16, "paths": ["<root>.extra['datamodule'].micro_batch_size"]}], ["Attr(name='global_batch_size')", {"type": "leaf", "value": 512, "paths": ["<root>.extra['datamodule'].global_batch_size"]}], ["Attr(name='split')", {"type": "leaf", "value": "99,8,2", "paths": ["<root>.extra['datamodule'].split"]}], 
["Attr(name='index_mapping_dir')", {"type": "leaf", "value": "Data/index_mapping_local_shard_1", "paths": ["<root>.extra['datamodule'].index_mapping_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_12"}, "paths": ["<root>.extra['datamodule']"]}, "tuple_14": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "datamodule"]], "metadata": null}, "dict_26": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='datamodule')", {"type": "ref", "key": "pre_training_data_module_1"}]], "metadata": {"type": "ref", "key": "tuple_14"}, "paths": ["<root>.extra"]}, "tuple_15": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "model"], ["Index(index=1)", "trainer"], ["Index(index=2)", "extra"]], "metadata": null}, "dict_27": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_28": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_13": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.pl", "name": "TrainerContext"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_15"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_27"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_28"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_context_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='model')", {"type": "ref", "key": "llama_model_1"}], ["Attr(name='trainer')", {"type": "ref", "key": "trainer_1"}], ["Attr(name='extra')", {"type": "ref", "key": "dict_26"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_13"}, "paths": ["<root>"]}}, "refcounts": {"tuple_1": 1, "dict_1": 1, "dict_2": 1, "buildable_traverser_metadata_1": 1, "llama32_config1_b_1": 1, "tuple_2": 1, "dict_3": 1, "dict_4": 1, "buildable_traverser_metadata_2": 1, "optimizer_config_1": 1, "tuple_3": 1, "dict_5": 1, "dict_6": 1, "buildable_traverser_metadata_3": 1, "cosine_annealing_scheduler_1": 1, "tuple_4": 1, "dict_7": 1, "dict_8": 1, "buildable_traverser_metadata_4": 1, "megatron_optimizer_module_1": 1, "tuple_5": 1, "dict_9": 1, "dict_10": 1, "buildable_traverser_metadata_5": 1, "dir_or_string_artifact_1": 1, "tuple_6": 1, "dict_11": 1, "dict_12": 1, "buildable_traverser_metadata_6": 1, "auto_tokenizer_1": 1, "tuple_7": 1, "dict_13": 1, "dict_14": 1, "buildable_traverser_metadata_7": 1, "llama_model_1": 1, "tuple_8": 1, "dict_15": 1, "dict_16": 1, "buildable_traverser_metadata_8": 1, "distributed_data_parallel_config_1": 1, "tuple_9": 1, "dict_17": 1, "tuple_10": 1, "dict_18": 1, "dict_19": 1, "buildable_traverser_metadata_9": 1, "megatron_strategy_1": 1, "timing_callback_1": 1, "garbage_collection_callback_1": 1, "list_1": 1, "megatron_mixed_precision_1": 1, "tuple_11": 1, "dict_20": 1, "dict_21": 1, "buildable_traverser_metadata_10": 1, "trainer_1": 1, "list_2": 1, "tuple_12": 1, "dict_22": 1, "dict_23": 1, "buildable_traverser_metadata_11": 1, "auto_tokenizer_2": 1, "tuple_13": 1, 
"dict_24": 1, "dict_25": 1, "buildable_traverser_metadata_12": 1, "pre_training_data_module_1": 1, "tuple_14": 1, "dict_26": 1, "tuple_15": 1, "dict_27": 1, "dict_28": 1, "buildable_traverser_metadata_13": 1, "trainer_context_1": 1}, "version": "0.0.1"}
model_name=0--step=1299-consumed_samples=665600.0-last/context/model.yaml ADDED
@@ -0,0 +1,266 @@
+ _target_: nemo.collections.llm.gpt.model.llama.LlamaModel
+ config:
+   _cpu_offloading_context: null
+   _target_: nemo.collections.llm.gpt.model.llama.Llama32Config1B
+   account_for_embedding_in_pipeline_split: false
+   account_for_loss_in_pipeline_split: false
+   activation_func:
+     _call_: false
+     _target_: torch.nn.functional.silu
+   activation_func_fp8_input_store: false
+   add_bias_linear: false
+   add_qkv_bias: false
+   apply_query_key_layer_scaling: false
+   apply_residual_connection_post_layernorm: false
+   apply_rope_fusion: true
+   async_tensor_model_parallel_allreduce: false
+   attention_backend:
+     _call_: true
+     _target_: megatron.core.transformer.enums.AttnBackend
+   attention_dropout: 0.0
+   attention_softmax_in_fp32: false
+   autocast_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   barrier_with_L1_time: true
+   batch_p2p_comm: true
+   batch_p2p_sync: true
+   bf16: true
+   bias_activation_fusion: true
+   bias_dropout_fusion: true
+   calculate_per_token_loss: false
+   clone_scatter_output_in_embedding: true
+   config_logger_dir: ''
+   context_parallel_size: 1
+   cp_comm_type: null
+   cpu_offloading: false
+   cpu_offloading_activations: true
+   cpu_offloading_num_layers: 0
+   cpu_offloading_weights: true
+   cross_entropy_loss_fusion: true
+   cuda_graph_retain_backward_graph: false
+   cuda_graph_use_single_mempool: false
+   cuda_graph_warmup_steps: 3
+   data_step_fn:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
+   deallocate_pipeline_outputs: true
+   defer_embedding_wgrad_compute: false
+   deterministic_mode: false
+   disable_parameter_transpose_cache: false
+   distribute_saved_activations: null
+   enable_autocast: false
+   enable_cuda_graph: false
+   expert_model_parallel_size: 1
+   expert_tensor_parallel_size: null
+   external_cuda_graph: false
+   ffn_hidden_size: 8192
+   finalize_model_grads_func: null
+   flash_decode: false
+   forward_step_fn:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
+   fp16: false
+   fp16_lm_cross_entropy: false
+   fp32_residual_connection: false
+   fp8: null
+   fp8_amax_compute_algo: most_recent
+   fp8_amax_history_len: 1
+   fp8_dot_product_attention: false
+   fp8_interval: 1
+   fp8_margin: 0
+   fp8_multi_head_attention: false
+   fp8_wgrad: true
+   gated_linear_unit: true
+   grad_scale_func: null
+   grad_sync_func: null
+   gradient_accumulation_fusion: true
+   hidden_dropout: 0.0
+   hidden_size: 2048
+   hierarchical_context_parallel_sizes: null
+   high_freq_factor: 4
+   inference_rng_tracker: false
+   init_method: null
+   init_method_std: 0.02
+   kv_channels: null
+   layernorm_epsilon: 1.0e-05
+   layernorm_zero_centered_gamma: false
+   low_freq_factor: 1
+   make_vocab_size_divisible_by: 128
+   masked_softmax_fusion: true
+   memory_efficient_layer_norm: false
+   microbatch_group_size_per_vp_stage: 1
+   moe_aux_loss_coeff: 0
+   moe_expert_capacity_factor: null
+   moe_extended_tp: false
+   moe_ffn_hidden_size: null
+   moe_grouped_gemm: false
+   moe_input_jitter_eps: null
+   moe_layer_freq: 1
+   moe_layer_recompute: false
+   moe_pad_expert_input_to_capacity: false
+   moe_per_layer_logging: false
+   moe_permute_fusion: false
+   moe_router_bias_update_rate: 0.001
+   moe_router_enable_expert_bias: false
+   moe_router_group_topk: null
+   moe_router_load_balancing_type: aux_loss
+   moe_router_num_groups: null
+   moe_router_pre_softmax: false
+   moe_router_score_function: softmax
+   moe_router_topk: 2
+   moe_router_topk_limited_devices: null
+   moe_router_topk_scaling_factor: null
+   moe_shared_expert_intermediate_size: null
+   moe_shared_expert_overlap: false
+   moe_token_dispatcher_type: allgather
+   moe_token_drop_policy: probs
+   moe_token_dropping: false
+   moe_use_legacy_grouped_gemm: false
+   moe_z_loss_coeff: null
+   multi_latent_attention: false
+   no_sync_func: null
+   normalization: RMSNorm
+   num_attention_heads: 32
+   num_layers: 16
+   num_layers_in_first_pipeline_stage: null
+   num_layers_in_last_pipeline_stage: null
+   num_microbatches_with_partial_activation_checkpoints: null
+   num_moe_experts: null
+   num_query_groups: 8
+   old_context_len: 8192
+   output_layer_init_method: null
+   overlap_p2p_comm: false
+   overlap_p2p_comm_warmup_flush: false
+   parallel_output: true
+   param_sync_func: null
+   params_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   perform_initialization: true
+   persist_layer_norm: true
+   pipeline_dtype:
+     _call_: false
+     _target_: torch.bfloat16
+   pipeline_model_parallel_size: 1
+   pipeline_model_parallel_split_rank: null
+   position_embedding_type: rope
+   qk_layernorm: false
+   recompute_granularity: null
+   recompute_method: null
+   recompute_num_layers: null
+   rotary_base: 500000
+   rotary_interleaved: false
+   rotary_percent: 1.0
+   scale_factor: 32
+   scatter_embedding_sequence_parallel: true
+   seq_len_interpolation_factor: null
+   seq_length: 2048
+   sequence_parallel: false
+   share_embeddings_and_output_weights: true
+   softmax_scale: null
+   tensor_model_parallel_size: 1
+   test_mode: false
+   timers: null
+   tp_comm_atomic_ag: false
+   tp_comm_atomic_rs: false
+   tp_comm_bootstrap_backend: nccl
+   tp_comm_bulk_dgrad: true
+   tp_comm_bulk_wgrad: true
+   tp_comm_overlap: false
+   tp_comm_overlap_ag: true
+   tp_comm_overlap_disable_fc1: false
+   tp_comm_overlap_disable_qkv: false
+   tp_comm_overlap_rs: true
+   tp_comm_overlap_rs_dgrad: false
+   tp_comm_split_ag: true
+   tp_comm_split_rs: true
+   tp_only_amax_red: false
+   transformer_layer_spec:
+     _call_: false
+     _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
+   use_cpu_initialization: false
+   use_ring_exchange_p2p: false
+   use_te_rng_tracker: false
+   use_transformer_engine_full_layer_spec: false
+   variable_seq_lengths: false
+   virtual_pipeline_model_parallel_size: null
+   wgrad_deferral_limit: 0
+   window_size: null
+ model_transform: null
+ optim:
+   _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
+   config:
+     _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-05
+     barrier_with_L1_time: false
+     bf16: true
+     clip_grad: 1.0
+     config_logger_dir: ''
+     decoupled_lr: null
+     decoupled_min_lr: null
+     exp_avg_dtype:
+       _call_: false
+       _target_: torch.float32
+     exp_avg_sq_dtype:
+       _call_: false
+       _target_: torch.float32
+     fp16: false
+     hysteresis: 2
+     initial_loss_scale: 4294967296
+     log_num_zeros_in_grad: false
+     loss_scale: null
+     loss_scale_window: 1000
+     lr: 0.0003
+     main_grads_dtype:
+       _call_: false
+       _target_: torch.float32
+     main_params_dtype:
+       _call_: false
+       _target_: torch.float32
+     min_loss_scale: 1.0
+     min_lr: null
+     optimizer: adam
+     overlap_param_gather_with_optimizer_step: false
+     params_dtype:
+       _call_: false
+       _target_: torch.float32
+     sgd_momentum: 0.9
+     timers: null
+     use_distributed_optimizer: true
+     use_precision_aware_optimizer: false
+     weight_decay: 0.1
+   lr_mult: 1.0
+   lr_scheduler:
+     _target_: nemo.lightning.pytorch.optim.lr_scheduler.CosineAnnealingScheduler
+     constant_steps: 0
+     frequency: 1
+     interval: step
+     max_steps: 10
+     min_lr: 2.9999999999999997e-05
+     monitor: val_loss
+     warmup_steps: 2000
+   no_weight_decay_cond: null
+   scale_lr_cond: null
+ tokenizer:
+   _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
+   additional_special_tokens: []
+   bos_token: null
+   cls_token: null
+   eos_token: null
+   include_special_tokens: false
+   mask_token: null
+   merges_file: null
+   pad_token: null
+   pretrained_model_name:
+     _target_: nemo.lightning.io.artifact.file.DirOrStringArtifact
+     attr: allenai/OLMo-1B-hf
+     required: true
+     skip: true
+   sep_token: null
+   trust_remote_code: false
+   unk_token: null
+   use_fast: true
+   vocab_file: tokenizer_config.json
model_name=0--step=1299-consumed_samples=665600.0-last/context/tokenizer_config.json ADDED
@@ -0,0 +1,238 @@
+ {
+ "add_bos_token": false,
+ "add_eos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "|||IP_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "1": {
+ "content": "<|padding|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "50254": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50255": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50256": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50257": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50258": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50259": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50260": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50261": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50262": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50263": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50264": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50265": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50266": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50267": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50268": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50269": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50270": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50271": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50272": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50273": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50274": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50275": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50276": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50277": {
+ "content": "|||EMAIL_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50278": {
+ "content": "|||PHONE_NUMBER|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50279": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": null,
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<|padding|>",
+ "tokenizer_class": "GPTNeoXTokenizer",
+ "unk_token": null
+ }
model_name=0--step=1299-consumed_samples=665600.0-last/weights/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff015a1d8e5a41381de4c787b8868c7e443c1315c77ae43d50ce5192d89cf080
+ size 310378496
model_name=0--step=1299-consumed_samples=665600.0-last/weights/__3_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57153877cf13fe09490ab7e00a5ac884bbd925fb29555e32829c75fb7d804b76
+ size 297795584
model_name=0--step=1299-consumed_samples=665600.0-last/weights/__4_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:79a23e6bbe69c8c364aecb5f24592e24e864252289169ee8084f6cf8a237a352
+ size 288358400
model_name=0--step=1299-consumed_samples=665600.0-last/weights/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a414da70b331ae88ec879fc328c7e6dd35ed96475476351797e6ffa5c5b833b
+ size 303038464
model_name=0--step=299-consumed_samples=153600.0/context/1c2c7859-77a9-4039-bffd-0ebd68c1fc94 ADDED
Binary file (202 Bytes).
model_name=0--step=299-consumed_samples=153600.0/context/684b8564-68fa-41fa-9cdc-bbdf4eba2314 ADDED
Binary file (173 Bytes).
model_name=0--step=299-consumed_samples=153600.0/context/8ceb9fce-96b9-44bd-8e0f-96e8cbc13ea4 ADDED
Binary file (584 Bytes).
model_name=0--step=299-consumed_samples=153600.0/context/io.json ADDED
@@ -0,0 +1 @@
+ {"root": {"type": "ref", "key": "trainer_context_1"}, "objects": {"tuple_1": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "sequence_parallel"], ["Index(index=4)", "context_parallel_size"], ["Index(index=5)", "expert_model_parallel_size"], ["Index(index=6)", "expert_tensor_parallel_size"], ["Index(index=7)", "moe_extended_tp"], ["Index(index=8)", "bf16"], ["Index(index=9)", "params_dtype"], ["Index(index=10)", "autocast_dtype"], ["Index(index=11)", "use_te_rng_tracker"], ["Index(index=12)", "pipeline_dtype"], ["Index(index=13)", "microbatch_group_size_per_vp_stage"], ["Index(index=14)", "account_for_embedding_in_pipeline_split"], ["Index(index=15)", "account_for_loss_in_pipeline_split"], ["Index(index=16)", "share_embeddings_and_output_weights"], ["Index(index=17)", "seq_length"]], "metadata": null}, "dict_1": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_2": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "Llama32Config1B"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_1"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_1"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_2"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama32_config1_b_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.virtual_pipeline_model_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.sequence_parallel"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.context_parallel_size"]}], ["Attr(name='expert_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.expert_model_parallel_size"]}], ["Attr(name='expert_tensor_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.expert_tensor_parallel_size"]}], ["Attr(name='moe_extended_tp')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.moe_extended_tp"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='autocast_dtype')", {"type": "pyref", 
"module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='use_te_rng_tracker')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.use_te_rng_tracker"]}], ["Attr(name='pipeline_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='microbatch_group_size_per_vp_stage')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.microbatch_group_size_per_vp_stage"]}], ["Attr(name='account_for_embedding_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_embedding_in_pipeline_split"]}], ["Attr(name='account_for_loss_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_loss_in_pipeline_split"]}], ["Attr(name='share_embeddings_and_output_weights')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.share_embeddings_and_output_weights"]}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.model.config.seq_length"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_1"}, "paths": ["<root>.model.config"]}, "tuple_2": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "optimizer"], ["Index(index=1)", "lr"], ["Index(index=2)", "min_lr"], ["Index(index=3)", "decoupled_lr"], ["Index(index=4)", "decoupled_min_lr"], ["Index(index=5)", "weight_decay"], ["Index(index=6)", "fp16"], ["Index(index=7)", "bf16"], ["Index(index=8)", "params_dtype"], ["Index(index=9)", "use_precision_aware_optimizer"], ["Index(index=10)", "main_grads_dtype"], ["Index(index=11)", "main_params_dtype"], ["Index(index=12)", "exp_avg_dtype"], ["Index(index=13)", "exp_avg_sq_dtype"], ["Index(index=14)", "loss_scale"], ["Index(index=15)", "initial_loss_scale"], ["Index(index=16)", "min_loss_scale"], ["Index(index=17)", "loss_scale_window"], ["Index(index=18)", "hysteresis"], ["Index(index=19)", "adam_beta1"], ["Index(index=20)", "adam_beta2"], ["Index(index=21)", "adam_eps"], ["Index(index=22)", "sgd_momentum"], ["Index(index=23)", "use_distributed_optimizer"], ["Index(index=24)", "overlap_param_gather_with_optimizer_step"], ["Index(index=25)", "clip_grad"], ["Index(index=26)", "log_num_zeros_in_grad"], ["Index(index=27)", "barrier_with_L1_time"], ["Index(index=28)", "timers"], ["Index(index=29)", "config_logger_dir"]], "metadata": null}, "dict_3": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_4": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.optimizer.optimizer_config", "name": "OptimizerConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_2"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_3"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_4"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": 
"BuildableTraverserMetadata"}}, "optimizer_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='optimizer')", {"type": "leaf", "value": "adam", "paths": ["<root>.model.optim.config.optimizer"]}], ["Attr(name='lr')", {"type": "leaf", "value": 0.0003, "paths": ["<root>.model.optim.config.lr"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.min_lr"]}], ["Attr(name='decoupled_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_lr"]}], ["Attr(name='decoupled_min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_min_lr"]}], ["Attr(name='weight_decay')", {"type": "leaf", "value": 0.1, "paths": ["<root>.model.optim.config.weight_decay"]}], ["Attr(name='fp16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.fp16"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='use_precision_aware_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_precision_aware_optimizer"]}], ["Attr(name='main_grads_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='main_params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_sq_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='loss_scale')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.loss_scale"]}], ["Attr(name='initial_loss_scale')", {"type": "leaf", "value": 4294967296, "paths": ["<root>.model.optim.config.initial_loss_scale"]}], ["Attr(name='min_loss_scale')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.min_loss_scale"]}], ["Attr(name='loss_scale_window')", {"type": "leaf", "value": 1000, "paths": ["<root>.model.optim.config.loss_scale_window"]}], ["Attr(name='hysteresis')", {"type": "leaf", "value": 2, "paths": ["<root>.model.optim.config.hysteresis"]}], ["Attr(name='adam_beta1')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.adam_beta1"]}], ["Attr(name='adam_beta2')", {"type": 
"leaf", "value": 0.95, "paths": ["<root>.model.optim.config.adam_beta2"]}], ["Attr(name='adam_eps')", {"type": "leaf", "value": 1e-05, "paths": ["<root>.model.optim.config.adam_eps"]}], ["Attr(name='sgd_momentum')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.sgd_momentum"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.use_distributed_optimizer"]}], ["Attr(name='overlap_param_gather_with_optimizer_step')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_param_gather_with_optimizer_step"]}], ["Attr(name='clip_grad')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.clip_grad"]}], ["Attr(name='log_num_zeros_in_grad')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.log_num_zeros_in_grad"]}], ["Attr(name='barrier_with_L1_time')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.barrier_with_L1_time"]}], ["Attr(name='timers')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.timers"]}], ["Attr(name='config_logger_dir')", {"type": "leaf", "value": "", "paths": ["<root>.model.optim.config.config_logger_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_2"}, "paths": ["<root>.model.optim.config"]}, "tuple_3": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "warmup_steps"], ["Index(index=1)", "constant_steps"], ["Index(index=2)", "min_lr"]], "metadata": null}, "dict_5": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_6": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_3": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.lr_scheduler", "name": "CosineAnnealingScheduler"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_3"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_5"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_6"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "cosine_annealing_scheduler_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='warmup_steps')", {"type": "leaf", "value": 2000, "paths": ["<root>.model.optim.lr_scheduler.warmup_steps"]}], ["Attr(name='constant_steps')", {"type": "leaf", "value": 0, "paths": ["<root>.model.optim.lr_scheduler.constant_steps"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": 2.9999999999999997e-05, "paths": ["<root>.model.optim.lr_scheduler.min_lr"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_3"}, "paths": ["<root>.model.optim.lr_scheduler"]}, "tuple_4": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "lr_scheduler"]], "metadata": null}, "dict_7": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_8": {"type": 
{"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_4": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.megatron", "name": "MegatronOptimizerModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_4"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_7"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_8"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_optimizer_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "optimizer_config_1"}], ["Attr(name='lr_scheduler')", {"type": "ref", "key": "cosine_annealing_scheduler_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_4"}, "paths": ["<root>.model.optim"]}, "tuple_5": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "attr"], ["Index(index=1)", "skip"]], "metadata": null}, "dict_9": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_10": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_5": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.artifact.file", "name": "DirOrStringArtifact"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_5"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_9"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_10"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "dir_or_string_artifact_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='attr')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.model.tokenizer.pretrained_model_name.attr"]}], ["Attr(name='skip')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.pretrained_model_name.skip"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_5"}, "paths": ["<root>.model.tokenizer.pretrained_model_name"]}, "tuple_6": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_11": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_12": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_6": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": 
[["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_6"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_11"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_12"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "ref", "key": "dir_or_string_artifact_1"}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "tokenizer_config.json", "paths": ["<root>.model.tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_6"}, "paths": ["<root>.model.tokenizer"]}, "tuple_7": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "optim"], ["Index(index=2)", "tokenizer"]], "metadata": null}, "dict_13": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_14": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_7": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "LlamaModel"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_7"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_13"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_14"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama_model_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "llama32_config1_b_1"}], ["Attr(name='optim')", {"type": "ref", "key": "megatron_optimizer_module_1"}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_7"}, "paths": ["<root>.model"]}, "tuple_8": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "grad_reduce_in_fp32"], ["Index(index=1)", "overlap_grad_reduce"], ["Index(index=2)", "overlap_param_gather"], ["Index(index=3)", "align_param_gather"], ["Index(index=4)", "use_distributed_optimizer"], ["Index(index=5)", "num_distributed_optimizer_instances"], ["Index(index=6)", "check_for_nan_in_grad"], ["Index(index=7)", "bucket_size"], ["Index(index=8)", "average_in_collective"], ["Index(index=9)", "fp8_param_gather"]], "metadata": null}, "dict_15": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_16": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "buildable_traverser_metadata_8": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.distributed.distributed_data_parallel_config", "name": "DistributedDataParallelConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_8"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_15"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_16"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "distributed_data_parallel_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='grad_reduce_in_fp32')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.grad_reduce_in_fp32"]}], ["Attr(name='overlap_grad_reduce')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_grad_reduce"]}], ["Attr(name='overlap_param_gather')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_param_gather"]}], ["Attr(name='align_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.align_param_gather"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.use_distributed_optimizer"]}], ["Attr(name='num_distributed_optimizer_instances')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.ddp.num_distributed_optimizer_instances"]}], ["Attr(name='check_for_nan_in_grad')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.check_for_nan_in_grad"]}], ["Attr(name='bucket_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.ddp.bucket_size"]}], ["Attr(name='average_in_collective')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.average_in_collective"]}], ["Attr(name='fp8_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.fp8_param_gather"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_8"}, "paths": ["<root>.trainer.strategy.ddp"]}, "tuple_9": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "gradient_as_bucket_view"]], "metadata": null}, "dict_17": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='gradient_as_bucket_view')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.kwargs['gradient_as_bucket_view']"]}]], "metadata": {"type": "ref", "key": "tuple_9"}, "paths": ["<root>.trainer.strategy.kwargs"]}, "tuple_10": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "context_parallel_size"], ["Index(index=4)", "sequence_parallel"], ["Index(index=5)", "ddp"], ["Index(index=6)", "pipeline_dtype"], ["Index(index=7)", "ckpt_async_save"], ["Index(index=8)", "ckpt_parallel_load"], ["Index(index=9)", "kwargs"]], "metadata": null}, "dict_18": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_19": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": 
"pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_9": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.strategies.megatron_strategy", "name": "MegatronStrategy"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_10"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_18"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_19"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_strategy_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.virtual_pipeline_model_parallel_size"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.context_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.sequence_parallel"]}], ["Attr(name='ddp')", {"type": "ref", "key": "distributed_data_parallel_config_1"}], ["Attr(name='pipeline_dtype')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.pipeline_dtype"]}], ["Attr(name='ckpt_async_save')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_async_save"]}], ["Attr(name='ckpt_parallel_load')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_parallel_load"]}], ["Attr(name='kwargs')", {"type": "ref", "key": "dict_17"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_9"}, "paths": ["<root>.trainer.strategy"]}, "timing_callback_1": {"type": {"type": "pyref", "module": "nemo.utils.exp_manager", "name": "TimingCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "1c2c7859-77a9-4039-bffd-0ebd68c1fc94", "paths": ["<root>.trainer.callbacks[0]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[0]"]}, "garbage_collection_callback_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.callbacks.garbage_collection", "name": "GarbageCollectionCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "684b8564-68fa-41fa-9cdc-bbdf4eba2314", "paths": ["<root>.trainer.callbacks[1]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[1]"]}, "list_1": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "ref", "key": "timing_callback_1"}], ["Index(index=1)", {"type": "ref", "key": "garbage_collection_callback_1"}]], "metadata": null, "paths": ["<root>.trainer.callbacks"]}, "megatron_mixed_precision_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.plugins.mixed_precision", "name": "MegatronMixedPrecision"}, "items": [["IdentityElement()", {"type": "leaf", "value": "8ceb9fce-96b9-44bd-8e0f-96e8cbc13ea4", "paths": ["<root>.trainer.plugins"]}]], "metadata": null, "paths": ["<root>.trainer.plugins"]}, "tuple_11": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "accelerator"], 
["Index(index=1)", "strategy"], ["Index(index=2)", "devices"], ["Index(index=3)", "num_nodes"], ["Index(index=4)", "callbacks"], ["Index(index=5)", "max_steps"], ["Index(index=6)", "limit_val_batches"], ["Index(index=7)", "val_check_interval"], ["Index(index=8)", "log_every_n_steps"], ["Index(index=9)", "accumulate_grad_batches"], ["Index(index=10)", "use_distributed_sampler"], ["Index(index=11)", "plugins"]], "metadata": null}, "dict_20": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_21": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_10": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.trainer", "name": "Trainer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_11"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_20"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_21"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='accelerator')", {"type": "leaf", "value": "gpu", "paths": ["<root>.trainer.accelerator"]}], ["Attr(name='strategy')", {"type": "ref", "key": "megatron_strategy_1"}], ["Attr(name='devices')", {"type": "leaf", "value": 8, "paths": ["<root>.trainer.devices"]}], ["Attr(name='num_nodes')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.num_nodes"]}], ["Attr(name='callbacks')", {"type": "ref", "key": "list_1"}], ["Attr(name='max_steps')", {"type": "leaf", "value": 1168251, "paths": ["<root>.trainer.max_steps"]}], ["Attr(name='limit_val_batches')", {"type": "leaf", "value": 32, "paths": ["<root>.trainer.limit_val_batches"]}], ["Attr(name='val_check_interval')", {"type": "leaf", "value": 100, "paths": ["<root>.trainer.val_check_interval"]}], ["Attr(name='log_every_n_steps')", {"type": "leaf", "value": 10, "paths": ["<root>.trainer.log_every_n_steps"]}], ["Attr(name='accumulate_grad_batches')", {"type": "leaf", "value": 4, "paths": ["<root>.trainer.accumulate_grad_batches"]}], ["Attr(name='use_distributed_sampler')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.use_distributed_sampler"]}], ["Attr(name='plugins')", {"type": "ref", "key": "megatron_mixed_precision_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_10"}, "paths": ["<root>.trainer"]}, "list_2": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "leaf", "value": "Data/dclm_local_shard_1_megatron/concatenated.jsonl_text_document", "paths": ["<root>.extra['datamodule'].paths[0]"]}]], "metadata": null, "paths": ["<root>.extra['datamodule'].paths"]}, "tuple_12": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_22": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "dict_23": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_11": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_12"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_22"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_23"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.extra['datamodule'].tokenizer.pretrained_model_name"]}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "Data/tokenizer/tokenizer_config.json", "paths": ["<root>.extra['datamodule'].tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.extra['datamodule'].tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_11"}, "paths": ["<root>.extra['datamodule'].tokenizer"]}, "tuple_13": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "paths"], ["Index(index=1)", "seq_length"], ["Index(index=2)", "tokenizer"], ["Index(index=3)", "micro_batch_size"], ["Index(index=4)", "global_batch_size"], ["Index(index=5)", "split"], ["Index(index=6)", "index_mapping_dir"]], "metadata": null}, "dict_24": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_25": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_12": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.data.pre_training", "name": "PreTrainingDataModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_13"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_24"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_25"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "pre_training_data_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='paths')", {"type": "ref", "key": "list_2"}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.extra['datamodule'].seq_length"]}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_2"}], ["Attr(name='micro_batch_size')", {"type": "leaf", "value": 16, "paths": ["<root>.extra['datamodule'].micro_batch_size"]}], ["Attr(name='global_batch_size')", {"type": "leaf", "value": 512, "paths": ["<root>.extra['datamodule'].global_batch_size"]}], ["Attr(name='split')", {"type": "leaf", "value": "99,8,2", "paths": ["<root>.extra['datamodule'].split"]}], 
["Attr(name='index_mapping_dir')", {"type": "leaf", "value": "Data/index_mapping_local_shard_1", "paths": ["<root>.extra['datamodule'].index_mapping_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_12"}, "paths": ["<root>.extra['datamodule']"]}, "tuple_14": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "datamodule"]], "metadata": null}, "dict_26": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='datamodule')", {"type": "ref", "key": "pre_training_data_module_1"}]], "metadata": {"type": "ref", "key": "tuple_14"}, "paths": ["<root>.extra"]}, "tuple_15": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "model"], ["Index(index=1)", "trainer"], ["Index(index=2)", "extra"]], "metadata": null}, "dict_27": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_28": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_13": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.pl", "name": "TrainerContext"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_15"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_27"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_28"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_context_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='model')", {"type": "ref", "key": "llama_model_1"}], ["Attr(name='trainer')", {"type": "ref", "key": "trainer_1"}], ["Attr(name='extra')", {"type": "ref", "key": "dict_26"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_13"}, "paths": ["<root>"]}}, "refcounts": {"tuple_1": 1, "dict_1": 1, "dict_2": 1, "buildable_traverser_metadata_1": 1, "llama32_config1_b_1": 1, "tuple_2": 1, "dict_3": 1, "dict_4": 1, "buildable_traverser_metadata_2": 1, "optimizer_config_1": 1, "tuple_3": 1, "dict_5": 1, "dict_6": 1, "buildable_traverser_metadata_3": 1, "cosine_annealing_scheduler_1": 1, "tuple_4": 1, "dict_7": 1, "dict_8": 1, "buildable_traverser_metadata_4": 1, "megatron_optimizer_module_1": 1, "tuple_5": 1, "dict_9": 1, "dict_10": 1, "buildable_traverser_metadata_5": 1, "dir_or_string_artifact_1": 1, "tuple_6": 1, "dict_11": 1, "dict_12": 1, "buildable_traverser_metadata_6": 1, "auto_tokenizer_1": 1, "tuple_7": 1, "dict_13": 1, "dict_14": 1, "buildable_traverser_metadata_7": 1, "llama_model_1": 1, "tuple_8": 1, "dict_15": 1, "dict_16": 1, "buildable_traverser_metadata_8": 1, "distributed_data_parallel_config_1": 1, "tuple_9": 1, "dict_17": 1, "tuple_10": 1, "dict_18": 1, "dict_19": 1, "buildable_traverser_metadata_9": 1, "megatron_strategy_1": 1, "timing_callback_1": 1, "garbage_collection_callback_1": 1, "list_1": 1, "megatron_mixed_precision_1": 1, "tuple_11": 1, "dict_20": 1, "dict_21": 1, "buildable_traverser_metadata_10": 1, "trainer_1": 1, "list_2": 1, "tuple_12": 1, "dict_22": 1, "dict_23": 1, "buildable_traverser_metadata_11": 1, "auto_tokenizer_2": 1, "tuple_13": 1, 
"dict_24": 1, "dict_25": 1, "buildable_traverser_metadata_12": 1, "pre_training_data_module_1": 1, "tuple_14": 1, "dict_26": 1, "tuple_15": 1, "dict_27": 1, "dict_28": 1, "buildable_traverser_metadata_13": 1, "trainer_context_1": 1}, "version": "0.0.1"}
model_name=0--step=299-consumed_samples=153600.0/context/model.yaml ADDED
@@ -0,0 +1,266 @@
1
+ _target_: nemo.collections.llm.gpt.model.llama.LlamaModel
2
+ config:
3
+ _cpu_offloading_context: null
4
+ _target_: nemo.collections.llm.gpt.model.llama.Llama32Config1B
5
+ account_for_embedding_in_pipeline_split: false
6
+ account_for_loss_in_pipeline_split: false
7
+ activation_func:
8
+ _call_: false
9
+ _target_: torch.nn.functional.silu
10
+ activation_func_fp8_input_store: false
11
+ add_bias_linear: false
12
+ add_qkv_bias: false
13
+ apply_query_key_layer_scaling: false
14
+ apply_residual_connection_post_layernorm: false
15
+ apply_rope_fusion: true
16
+ async_tensor_model_parallel_allreduce: false
17
+ attention_backend:
18
+ _call_: true
19
+ _target_: megatron.core.transformer.enums.AttnBackend
20
+ attention_dropout: 0.0
21
+ attention_softmax_in_fp32: false
22
+ autocast_dtype:
23
+ _call_: false
24
+ _target_: torch.bfloat16
25
+ barrier_with_L1_time: true
26
+ batch_p2p_comm: true
27
+ batch_p2p_sync: true
28
+ bf16: true
29
+ bias_activation_fusion: true
30
+ bias_dropout_fusion: true
31
+ calculate_per_token_loss: false
32
+ clone_scatter_output_in_embedding: true
33
+ config_logger_dir: ''
34
+ context_parallel_size: 1
35
+ cp_comm_type: null
36
+ cpu_offloading: false
37
+ cpu_offloading_activations: true
38
+ cpu_offloading_num_layers: 0
39
+ cpu_offloading_weights: true
40
+ cross_entropy_loss_fusion: true
41
+ cuda_graph_retain_backward_graph: false
42
+ cuda_graph_use_single_mempool: false
43
+ cuda_graph_warmup_steps: 3
44
+ data_step_fn:
45
+ _call_: false
46
+ _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
47
+ deallocate_pipeline_outputs: true
48
+ defer_embedding_wgrad_compute: false
49
+ deterministic_mode: false
50
+ disable_parameter_transpose_cache: false
51
+ distribute_saved_activations: null
52
+ enable_autocast: false
53
+ enable_cuda_graph: false
54
+ expert_model_parallel_size: 1
55
+ expert_tensor_parallel_size: null
56
+ external_cuda_graph: false
57
+ ffn_hidden_size: 8192
58
+ finalize_model_grads_func: null
59
+ flash_decode: false
60
+ forward_step_fn:
61
+ _call_: false
62
+ _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
63
+ fp16: false
64
+ fp16_lm_cross_entropy: false
65
+ fp32_residual_connection: false
66
+ fp8: null
67
+ fp8_amax_compute_algo: most_recent
68
+ fp8_amax_history_len: 1
69
+ fp8_dot_product_attention: false
70
+ fp8_interval: 1
71
+ fp8_margin: 0
72
+ fp8_multi_head_attention: false
73
+ fp8_wgrad: true
74
+ gated_linear_unit: true
75
+ grad_scale_func: null
76
+ grad_sync_func: null
77
+ gradient_accumulation_fusion: true
78
+ hidden_dropout: 0.0
79
+ hidden_size: 2048
80
+ hierarchical_context_parallel_sizes: null
81
+ high_freq_factor: 4
+ inference_rng_tracker: false
+ init_method: null
+ init_method_std: 0.02
+ kv_channels: null
+ layernorm_epsilon: 1.0e-05
+ layernorm_zero_centered_gamma: false
+ low_freq_factor: 1
+ make_vocab_size_divisible_by: 128
+ masked_softmax_fusion: true
+ memory_efficient_layer_norm: false
+ microbatch_group_size_per_vp_stage: 1
+ moe_aux_loss_coeff: 0
+ moe_expert_capacity_factor: null
+ moe_extended_tp: false
+ moe_ffn_hidden_size: null
+ moe_grouped_gemm: false
+ moe_input_jitter_eps: null
+ moe_layer_freq: 1
+ moe_layer_recompute: false
+ moe_pad_expert_input_to_capacity: false
+ moe_per_layer_logging: false
+ moe_permute_fusion: false
+ moe_router_bias_update_rate: 0.001
+ moe_router_enable_expert_bias: false
+ moe_router_group_topk: null
+ moe_router_load_balancing_type: aux_loss
+ moe_router_num_groups: null
+ moe_router_pre_softmax: false
+ moe_router_score_function: softmax
+ moe_router_topk: 2
+ moe_router_topk_limited_devices: null
+ moe_router_topk_scaling_factor: null
+ moe_shared_expert_intermediate_size: null
+ moe_shared_expert_overlap: false
+ moe_token_dispatcher_type: allgather
+ moe_token_drop_policy: probs
+ moe_token_dropping: false
+ moe_use_legacy_grouped_gemm: false
+ moe_z_loss_coeff: null
+ multi_latent_attention: false
+ no_sync_func: null
+ normalization: RMSNorm
+ num_attention_heads: 32
+ num_layers: 16
+ num_layers_in_first_pipeline_stage: null
+ num_layers_in_last_pipeline_stage: null
+ num_microbatches_with_partial_activation_checkpoints: null
+ num_moe_experts: null
+ num_query_groups: 8
+ old_context_len: 8192
+ output_layer_init_method: null
+ overlap_p2p_comm: false
+ overlap_p2p_comm_warmup_flush: false
+ parallel_output: true
+ param_sync_func: null
+ params_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ perform_initialization: true
+ persist_layer_norm: true
+ pipeline_dtype:
+ _call_: false
+ _target_: torch.bfloat16
+ pipeline_model_parallel_size: 1
+ pipeline_model_parallel_split_rank: null
+ position_embedding_type: rope
+ qk_layernorm: false
+ recompute_granularity: null
+ recompute_method: null
+ recompute_num_layers: null
+ rotary_base: 500000
+ rotary_interleaved: false
+ rotary_percent: 1.0
+ scale_factor: 32
+ scatter_embedding_sequence_parallel: true
+ seq_len_interpolation_factor: null
+ seq_length: 2048
+ sequence_parallel: false
+ share_embeddings_and_output_weights: true
+ softmax_scale: null
+ tensor_model_parallel_size: 1
+ test_mode: false
+ timers: null
+ tp_comm_atomic_ag: false
+ tp_comm_atomic_rs: false
+ tp_comm_bootstrap_backend: nccl
+ tp_comm_bulk_dgrad: true
+ tp_comm_bulk_wgrad: true
+ tp_comm_overlap: false
+ tp_comm_overlap_ag: true
+ tp_comm_overlap_disable_fc1: false
+ tp_comm_overlap_disable_qkv: false
+ tp_comm_overlap_rs: true
+ tp_comm_overlap_rs_dgrad: false
+ tp_comm_split_ag: true
+ tp_comm_split_rs: true
+ tp_only_amax_red: false
+ transformer_layer_spec:
+ _call_: false
+ _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
+ use_cpu_initialization: false
+ use_ring_exchange_p2p: false
+ use_te_rng_tracker: false
+ use_transformer_engine_full_layer_spec: false
+ variable_seq_lengths: false
+ virtual_pipeline_model_parallel_size: null
+ wgrad_deferral_limit: 0
+ window_size: null
+ model_transform: null
+ optim:
+ _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
+ config:
+ _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ adam_eps: 1.0e-05
+ barrier_with_L1_time: false
+ bf16: true
+ clip_grad: 1.0
+ config_logger_dir: ''
+ decoupled_lr: null
+ decoupled_min_lr: null
+ exp_avg_dtype:
+ _call_: false
+ _target_: torch.float32
+ exp_avg_sq_dtype:
+ _call_: false
+ _target_: torch.float32
+ fp16: false
+ hysteresis: 2
+ initial_loss_scale: 4294967296
+ log_num_zeros_in_grad: false
+ loss_scale: null
+ loss_scale_window: 1000
+ lr: 0.0003
+ main_grads_dtype:
+ _call_: false
+ _target_: torch.float32
+ main_params_dtype:
+ _call_: false
+ _target_: torch.float32
+ min_loss_scale: 1.0
+ min_lr: null
+ optimizer: adam
+ overlap_param_gather_with_optimizer_step: false
+ params_dtype:
+ _call_: false
+ _target_: torch.float32
+ sgd_momentum: 0.9
+ timers: null
+ use_distributed_optimizer: true
+ use_precision_aware_optimizer: false
+ weight_decay: 0.1
+ lr_mult: 1.0
+ lr_scheduler:
+ _target_: nemo.lightning.pytorch.optim.lr_scheduler.CosineAnnealingScheduler
+ constant_steps: 0
+ frequency: 1
+ interval: step
+ max_steps: 10
+ min_lr: 2.9999999999999997e-05
+ monitor: val_loss
+ warmup_steps: 2000
+ no_weight_decay_cond: null
+ scale_lr_cond: null
+ tokenizer:
+ _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
+ additional_special_tokens: []
+ bos_token: null
+ cls_token: null
+ eos_token: null
+ include_special_tokens: false
+ mask_token: null
+ merges_file: null
+ pad_token: null
+ pretrained_model_name:
+ _target_: nemo.lightning.io.artifact.file.DirOrStringArtifact
+ attr: allenai/OLMo-1B-hf
+ required: true
+ skip: true
+ sep_token: null
+ trust_remote_code: false
+ unk_token: null
+ use_fast: true
+ vocab_file: tokenizer_config.json
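Note on two of the fields above: `make_vocab_size_divisible_by: 128` and `tensor_model_parallel_size: 1` feed Megatron's vocab-padding rule, under which the raw tokenizer vocabulary is rounded up to a multiple of `divisible_by * tensor_parallel_size` before the embedding table is sized. A minimal sketch of that arithmetic (the helper name is ours, not part of this config or any library):

```python
import math

def padded_vocab_size(vocab_size: int, divisible_by: int = 128, tp_size: int = 1) -> int:
    """Round the raw vocab size up to a multiple of divisible_by * tp_size,
    mirroring Megatron-LM's vocab-padding behaviour (illustrative sketch only)."""
    multiple = divisible_by * tp_size
    return int(math.ceil(vocab_size / multiple) * multiple)

# The tokenizer referenced above defines token ids up to 50279, i.e. a raw vocab
# of at least 50280, which this rule would pad to 50304.
print(padded_vocab_size(50280))  # -> 50304
```

The padding rows carry no real tokens; they only keep the embedding matrix evenly divisible across tensor-parallel ranks.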
model_name=0--step=299-consumed_samples=153600.0/context/tokenizer_config.json ADDED
@@ -0,0 +1,238 @@
+ {
+ "add_bos_token": false,
+ "add_eos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "|||IP_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "1": {
+ "content": "<|padding|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "50254": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50255": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50256": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50257": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50258": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50259": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50260": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50261": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50262": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50263": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50264": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50265": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50266": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50267": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50268": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50269": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50270": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50271": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50272": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50273": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50274": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50275": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50276": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50277": {
+ "content": "|||EMAIL_ADDRESS|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50278": {
+ "content": "|||PHONE_NUMBER|||",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "50279": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": null,
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|endoftext|>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<|padding|>",
+ "tokenizer_class": "GPTNeoXTokenizer",
+ "unk_token": null
+ }
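This tokenizer_config.json corresponds to the allenai/OLMo-1B-hf tokenizer referenced in model.yaml above. A hedged way to sanity-check the special tokens against it (assumes the transformers package and access to that Hub repo; expected values are read off the JSON above, not guaranteed here):

```python
from transformers import AutoTokenizer

# Load the tokenizer the config points at and compare its special tokens
# with the tokenizer_config.json shown in this diff.
tok = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf", use_fast=True)

print(tok.eos_token)  # expected "<|endoftext|>" (id 50279 above)
print(tok.pad_token)  # expected "<|padding|>"   (id 1 above)
print(tok.bos_token)  # expected None, since bos_token is null above
```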
model_name=0--step=399-consumed_samples=204800.0/weights/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc35cd0258d73a250354e7531205b418e5b6501c9377fc0272f5b7d2ce07cde2
+ size 938897288
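Each *.distcp entry in this commit is stored as a Git LFS pointer: three lines giving the spec version, the sha256 of the real object, and its size in bytes. A small illustrative parser (helper name is ours, not part of any library):

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file (the 'version / oid / size' lines shown in
    the .distcp diffs here) into a dict. Illustrative helper only."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        if key:
            fields[key] = value
    fields["size"] = int(fields["size"])               # byte count of the real object
    fields["sha256"] = fields["oid"].split(":", 1)[1]  # strip the "sha256:" prefix
    return fields

# e.g. parse_lfs_pointer("model_name=0--step=399-consumed_samples=204800.0/weights/__0_0.distcp")
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:bc35...', 'size': 938897288, 'sha256': 'bc35...'}
```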
model_name=0--step=399-consumed_samples=204800.0/weights/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a9dfdc4c494fefee0538980e247b02e04e45d621cda53dd7f5469c593bb5a5b
+ size 940460984
model_name=0--step=399-consumed_samples=204800.0/weights/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7411a1b07a621db57044dc89b395d933f4582a88597f4e6f7a3e95dadca1303
+ size 938851248
model_name=0--step=399-consumed_samples=204800.0/weights/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:377eb6fd322b702cc00cb4c11730c6c7282cca1432d1e15aa596e07e12fb272b
+ size 937781988
model_name=0--step=399-consumed_samples=204800.0/weights/__2_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a1fcfd263ae23d278ee6d0a9ce1d611cac15b64b46e03e3f5214c9af637fefe
+ size 944982044
model_name=0--step=399-consumed_samples=204800.0/weights/__3_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0b93045fdd3c3acedd413bde0b22f0c336b4bcf29686ef3a26e6215522df08dd
+ size 943954152
model_name=0--step=399-consumed_samples=204800.0/weights/__4_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a75424ec18330730377bddbf462c05a43c9e8884d5f85731375a0ec5e52b53e
+ size 944985984
model_name=0--step=399-consumed_samples=204800.0/weights/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85d6c4912dd2eaccfecca2598a7adcc7ecfe31afa49ea40beddc5ed19e81a81c
+ size 941995240
model_name=0--step=399-consumed_samples=204800.0/weights/__5_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f865553aec5a86cfd417cf5dce51a88c957f7e18c03a100793407c6ab4d38b5f
+ size 945017376
model_name=0--step=399-consumed_samples=204800.0/weights/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5da31130ab0f68e9d1699b00b72be07cc717375df01eaa5b020f4cf61866840e
+ size 941969992
model_name=0--step=399-consumed_samples=204800.0/weights/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49e67752f947c893d549ac9e1c5d4e359cc13697aad8d24ba078be4793892483
+ size 938826468
model_name=0--step=499-consumed_samples=256000.0/weights/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:088024280af94b80de36da6945ae0208959434b7fae2b2181adea075dbd16660
+ size 943054924
model_name=0--step=499-consumed_samples=256000.0/weights/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d241e931336e3ea7335d3dc957bf56bb19783b224aefa526923e969c16985c3c
+ size 941995240
model_name=0--step=499-consumed_samples=256000.0/weights/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b39b565396c97c161ae66bcd7c5de809bee3f3d89ac5222312a3ef26d5de5ccb
+ size 938826468
model_name=0--step=599-consumed_samples=307200.0/weights/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:252f35fec37db96e24f06741f06ae3cb18fa7e6e50813e459edbd5052752a626
+ size 941995240
model_name=0--step=599-consumed_samples=307200.0/weights/__7_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:717a3881201a7ccab60e6889592529aa6ec106d77a876b500048ecc057d04aab
+ size 943936384
model_name=0--step=699-consumed_samples=358400.0/weights/__7_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba4ac54fb4679911329d41f8efc45403c967327409c2a6894dae3c584c8685c1
+ size 943936384
model_name=0--step=799-consumed_samples=409600.0/weights/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0847f68a2b02a59cebf510dabc496c1546733e6aeadd44f4a5d9496f2388a69f
+ size 940460984
model_name=0--step=799-consumed_samples=409600.0/weights/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9c963d152e1583cab181ea1e3280088a594c821c0246d2bfcb9f8a000f153b8
+ size 943054924
model_name=0--step=799-consumed_samples=409600.0/weights/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:adb0172b2019231c97283bc3e7398a63b09c101f2eb9d8cff8489d1798d967cb
+ size 941969992
model_name=0--step=799-consumed_samples=409600.0/weights/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85f9cfa3bfd99140790b700216fd626bcffacaff5952a567fee7f0ed081c967a
+ size 941969992
model_name=0--step=799-consumed_samples=409600.0/weights/__7_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8914b3b38ed5df9e7f55d83d4cde970e3f71d18fca65a79ba325235b75a9c58f
+ size 943936384
model_name=0--step=999-consumed_samples=512000.0/weights/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d66bda2796a2b65191702da913d1c4b996dac1f30240f38817a642943d98b532
+ size 940460984
model_name=0--step=999-consumed_samples=512000.0/weights/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95f9aef5f02a7b203e594a03cf2e1e5e75afe171035cf42d745c85b02f241321
+ size 943962344
model_name=0--step=999-consumed_samples=512000.0/weights/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6f45a1901fdbbfa8d3cd0e10c6f89f524f2e3d8a7e4aa690ce93629f869524c
+ size 937781988
model_name=0--step=999-consumed_samples=512000.0/weights/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7409b45e4f5d29bdda4dfb2c335122630297d578dc200ed83131a3ab4a19225
+ size 941995240
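The __<rank>_<index>.distcp shards in each weights/ directory, together with the .metadata file listed in this commit, appear to form a PyTorch distributed checkpoint. A hedged sketch for listing the tensors such a checkpoint describes, assuming a recent PyTorch with torch.distributed.checkpoint (exact APIs vary by version):

```python
# Inspect which tensors a sharded .distcp checkpoint directory contains.
# Assumption: the directory was written with PyTorch's distributed checkpoint
# (as NeMo/Megatron do); this only reads metadata, not the shard payloads.
from torch.distributed.checkpoint import FileSystemReader

ckpt_dir = "model_name=0--step=399-consumed_samples=204800.0/weights"  # one of the dirs above
reader = FileSystemReader(ckpt_dir)

# The .metadata file describes the global tensors; each __<rank>_<index>.distcp
# shard holds the slices written by one rank.
metadata = reader.read_metadata()
for key in sorted(metadata.state_dict_metadata):
    print(key)
```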