Add files using upload-large-folder tool
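For context, commits like this one are typically produced with the `upload_large_folder` helper in `huggingface_hub`, which uploads a large checkpoint tree in resumable chunks. A minimal sketch, assuming a recent `huggingface_hub` release; the repository id and local folder below are placeholders, not taken from this commit:

```python
from huggingface_hub import HfApi

api = HfApi()
# Resumable, multi-worker upload of a large local folder to a model repo.
# "your-org/your-repo" and "checkpoints/" are placeholder values.
api.upload_large_folder(
    repo_id="your-org/your-repo",
    repo_type="model",
    folder_path="checkpoints/",
)
```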
This view is limited to 50 files because it contains too many changes.
- .gitattributes +72 -0
- model_name=0--step=1099-consumed_samples=563200.0/context/365df6c4-8992-4ca2-aaa5-ef3123ca40ed +0 -0
- model_name=0--step=1099-consumed_samples=563200.0/context/458204eb-6487-422a-8674-610fd32a94cd +0 -0
- model_name=0--step=1099-consumed_samples=563200.0/context/e5e4999a-9ee0-4a4e-9cf6-c4f09d01bec2 +0 -0
- model_name=0--step=1099-consumed_samples=563200.0/context/io.json +1 -0
- model_name=0--step=1099-consumed_samples=563200.0/context/model.yaml +266 -0
- model_name=0--step=1099-consumed_samples=563200.0/context/tokenizer_config.json +238 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/.metadata +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__0_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__0_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__1_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__1_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__2_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__2_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__3_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__3_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__4_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__4_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__5_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__5_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__6_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__6_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__7_0.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/__7_1.distcp +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/common.pt +3 -0
- model_name=0--step=1099-consumed_samples=563200.0/weights/metadata.json +1 -0
- model_name=0--step=1199-consumed_samples=614400.0/context/7f55e7bc-67d0-43b6-a099-17d6faf84264 +0 -0
- model_name=0--step=1199-consumed_samples=614400.0/context/bcb2d43c-8276-4afb-b1da-8ef068679e7c +0 -0
- model_name=0--step=1199-consumed_samples=614400.0/context/d0be63b8-65ec-4740-b706-58027d280788 +0 -0
- model_name=0--step=1199-consumed_samples=614400.0/context/io.json +1 -0
- model_name=0--step=1199-consumed_samples=614400.0/context/model.yaml +266 -0
- model_name=0--step=1199-consumed_samples=614400.0/context/tokenizer_config.json +238 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/.metadata +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__0_0.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__0_1.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__1_0.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__2_0.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__2_1.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__3_1.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__4_0.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__5_0.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__5_1.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__6_0.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__6_1.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/__7_0.distcp +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/common.pt +3 -0
- model_name=0--step=1199-consumed_samples=614400.0/weights/metadata.json +1 -0
- model_name=0--step=1299-consumed_samples=665600.0-last-unfinished +0 -0
- model_name=0--step=1299-consumed_samples=665600.0-unfinished +0 -0
- model_name=0--step=1299-consumed_samples=665600.0/context/54c4eb36-2ea5-4b54-a535-213b8bd850a2 +0 -0
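Each checkpoint directory name above encodes the training step and the number of consumed samples, with `-unfinished` / `-last-unfinished` markers for saves that were still in flight. A small illustrative parser, not part of the repository, that splits such a name into its fields:

```python
import re

# Hypothetical helper: parse a checkpoint directory name such as
# "model_name=0--step=1099-consumed_samples=563200.0" into its components.
CKPT_RE = re.compile(
    r"model_name=(?P<model_name>.+?)--step=(?P<step>\d+)"
    r"-consumed_samples=(?P<consumed_samples>[\d.]+)"
    r"(?P<suffix>-last-unfinished|-unfinished)?$"
)

def parse_checkpoint_dir(name: str) -> dict:
    match = CKPT_RE.match(name)
    if match is None:
        raise ValueError(f"not a checkpoint directory name: {name!r}")
    return {
        "model_name": match.group("model_name"),
        "step": int(match.group("step")),
        "consumed_samples": float(match.group("consumed_samples")),
        "unfinished": match.group("suffix") is not None,
    }

print(parse_checkpoint_dir("model_name=0--step=1099-consumed_samples=563200.0"))
# {'model_name': '0', 'step': 1099, 'consumed_samples': 563200.0, 'unfinished': False}
```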
.gitattributes
CHANGED
@@ -33,3 +33,75 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=499-consumed_samples=256000.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=499-consumed_samples=256000.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=499-consumed_samples=256000.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=499-consumed_samples=256000.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=599-consumed_samples=307200.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=99-consumed_samples=51200.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1099-consumed_samples=563200.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=199-consumed_samples=102400.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=899-consumed_samples=460800.0/weights/.metadata filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1199-consumed_samples=614400.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
+model_name=0--step=1299-consumed_samples=665600.0/weights/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
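Each of these rules routes one checkpoint artifact (`.distcp` shards, `.metadata`, `common.pt`) through Git LFS. Purely as an illustration of how per-file entries like these can be generated (a single pattern such as `*.distcp` would work equally well), a hypothetical script:

```python
from pathlib import Path

# Illustrative only: print .gitattributes LFS rules, one per checkpoint shard,
# matching the format of the entries added in this commit.
LFS_RULE = "{path} filter=lfs diff=lfs merge=lfs -text"

def lfs_rules(repo_root: str = ".") -> list[str]:
    root = Path(repo_root)
    shards = sorted(
        p for p in root.rglob("*")
        if p.suffix == ".distcp" or p.name in {".metadata", "common.pt"}
    )
    return [LFS_RULE.format(path=p.relative_to(root).as_posix()) for p in shards]

if __name__ == "__main__":
    print("\n".join(lfs_rules()))
```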
model_name=0--step=1099-consumed_samples=563200.0/context/365df6c4-8992-4ca2-aaa5-ef3123ca40ed
ADDED
Binary file (584 Bytes)
model_name=0--step=1099-consumed_samples=563200.0/context/458204eb-6487-422a-8674-610fd32a94cd
ADDED
Binary file (202 Bytes)
model_name=0--step=1099-consumed_samples=563200.0/context/e5e4999a-9ee0-4a4e-9cf6-c4f09d01bec2
ADDED
Binary file (173 Bytes)
model_name=0--step=1099-consumed_samples=563200.0/context/io.json
ADDED
@@ -0,0 +1 @@
{"root": {"type": "ref", "key": "trainer_context_1"}, "objects": {"tuple_1": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "sequence_parallel"], ["Index(index=4)", "context_parallel_size"], ["Index(index=5)", "expert_model_parallel_size"], ["Index(index=6)", "expert_tensor_parallel_size"], ["Index(index=7)", "moe_extended_tp"], ["Index(index=8)", "bf16"], ["Index(index=9)", "params_dtype"], ["Index(index=10)", "autocast_dtype"], ["Index(index=11)", "use_te_rng_tracker"], ["Index(index=12)", "pipeline_dtype"], ["Index(index=13)", "microbatch_group_size_per_vp_stage"], ["Index(index=14)", "account_for_embedding_in_pipeline_split"], ["Index(index=15)", "account_for_loss_in_pipeline_split"], ["Index(index=16)", "share_embeddings_and_output_weights"], ["Index(index=17)", "seq_length"]], "metadata": null}, "dict_1": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_2": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "Llama32Config1B"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_1"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_1"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_2"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama32_config1_b_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.virtual_pipeline_model_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.sequence_parallel"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.context_parallel_size"]}], ["Attr(name='expert_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.expert_model_parallel_size"]}], ["Attr(name='expert_tensor_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.expert_tensor_parallel_size"]}], ["Attr(name='moe_extended_tp')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.moe_extended_tp"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='autocast_dtype')", {"type": "pyref", "module": 
"torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='use_te_rng_tracker')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.use_te_rng_tracker"]}], ["Attr(name='pipeline_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='microbatch_group_size_per_vp_stage')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.microbatch_group_size_per_vp_stage"]}], ["Attr(name='account_for_embedding_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_embedding_in_pipeline_split"]}], ["Attr(name='account_for_loss_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_loss_in_pipeline_split"]}], ["Attr(name='share_embeddings_and_output_weights')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.share_embeddings_and_output_weights"]}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.model.config.seq_length"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_1"}, "paths": ["<root>.model.config"]}, "tuple_2": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "optimizer"], ["Index(index=1)", "lr"], ["Index(index=2)", "min_lr"], ["Index(index=3)", "decoupled_lr"], ["Index(index=4)", "decoupled_min_lr"], ["Index(index=5)", "weight_decay"], ["Index(index=6)", "fp16"], ["Index(index=7)", "bf16"], ["Index(index=8)", "params_dtype"], ["Index(index=9)", "use_precision_aware_optimizer"], ["Index(index=10)", "main_grads_dtype"], ["Index(index=11)", "main_params_dtype"], ["Index(index=12)", "exp_avg_dtype"], ["Index(index=13)", "exp_avg_sq_dtype"], ["Index(index=14)", "loss_scale"], ["Index(index=15)", "initial_loss_scale"], ["Index(index=16)", "min_loss_scale"], ["Index(index=17)", "loss_scale_window"], ["Index(index=18)", "hysteresis"], ["Index(index=19)", "adam_beta1"], ["Index(index=20)", "adam_beta2"], ["Index(index=21)", "adam_eps"], ["Index(index=22)", "sgd_momentum"], ["Index(index=23)", "use_distributed_optimizer"], ["Index(index=24)", "overlap_param_gather_with_optimizer_step"], ["Index(index=25)", "clip_grad"], ["Index(index=26)", "log_num_zeros_in_grad"], ["Index(index=27)", "barrier_with_L1_time"], ["Index(index=28)", "timers"], ["Index(index=29)", "config_logger_dir"]], "metadata": null}, "dict_3": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_4": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.optimizer.optimizer_config", "name": "OptimizerConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_2"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_3"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_4"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": 
"BuildableTraverserMetadata"}}, "optimizer_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='optimizer')", {"type": "leaf", "value": "adam", "paths": ["<root>.model.optim.config.optimizer"]}], ["Attr(name='lr')", {"type": "leaf", "value": 0.0003, "paths": ["<root>.model.optim.config.lr"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.min_lr"]}], ["Attr(name='decoupled_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_lr"]}], ["Attr(name='decoupled_min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_min_lr"]}], ["Attr(name='weight_decay')", {"type": "leaf", "value": 0.1, "paths": ["<root>.model.optim.config.weight_decay"]}], ["Attr(name='fp16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.fp16"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='use_precision_aware_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_precision_aware_optimizer"]}], ["Attr(name='main_grads_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='main_params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_sq_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='loss_scale')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.loss_scale"]}], ["Attr(name='initial_loss_scale')", {"type": "leaf", "value": 4294967296, "paths": ["<root>.model.optim.config.initial_loss_scale"]}], ["Attr(name='min_loss_scale')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.min_loss_scale"]}], ["Attr(name='loss_scale_window')", {"type": "leaf", "value": 1000, "paths": ["<root>.model.optim.config.loss_scale_window"]}], ["Attr(name='hysteresis')", {"type": "leaf", "value": 2, "paths": ["<root>.model.optim.config.hysteresis"]}], ["Attr(name='adam_beta1')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.adam_beta1"]}], ["Attr(name='adam_beta2')", {"type": 
"leaf", "value": 0.95, "paths": ["<root>.model.optim.config.adam_beta2"]}], ["Attr(name='adam_eps')", {"type": "leaf", "value": 1e-05, "paths": ["<root>.model.optim.config.adam_eps"]}], ["Attr(name='sgd_momentum')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.sgd_momentum"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.use_distributed_optimizer"]}], ["Attr(name='overlap_param_gather_with_optimizer_step')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_param_gather_with_optimizer_step"]}], ["Attr(name='clip_grad')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.clip_grad"]}], ["Attr(name='log_num_zeros_in_grad')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.log_num_zeros_in_grad"]}], ["Attr(name='barrier_with_L1_time')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.barrier_with_L1_time"]}], ["Attr(name='timers')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.timers"]}], ["Attr(name='config_logger_dir')", {"type": "leaf", "value": "", "paths": ["<root>.model.optim.config.config_logger_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_2"}, "paths": ["<root>.model.optim.config"]}, "tuple_3": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "warmup_steps"], ["Index(index=1)", "constant_steps"], ["Index(index=2)", "min_lr"]], "metadata": null}, "dict_5": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_6": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_3": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.lr_scheduler", "name": "CosineAnnealingScheduler"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_3"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_5"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_6"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "cosine_annealing_scheduler_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='warmup_steps')", {"type": "leaf", "value": 2000, "paths": ["<root>.model.optim.lr_scheduler.warmup_steps"]}], ["Attr(name='constant_steps')", {"type": "leaf", "value": 0, "paths": ["<root>.model.optim.lr_scheduler.constant_steps"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": 2.9999999999999997e-05, "paths": ["<root>.model.optim.lr_scheduler.min_lr"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_3"}, "paths": ["<root>.model.optim.lr_scheduler"]}, "tuple_4": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "lr_scheduler"]], "metadata": null}, "dict_7": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_8": {"type": 
{"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_4": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.megatron", "name": "MegatronOptimizerModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_4"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_7"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_8"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_optimizer_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "optimizer_config_1"}], ["Attr(name='lr_scheduler')", {"type": "ref", "key": "cosine_annealing_scheduler_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_4"}, "paths": ["<root>.model.optim"]}, "tuple_5": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "attr"], ["Index(index=1)", "skip"]], "metadata": null}, "dict_9": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_10": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_5": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.artifact.file", "name": "DirOrStringArtifact"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_5"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_9"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_10"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "dir_or_string_artifact_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='attr')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.model.tokenizer.pretrained_model_name.attr"]}], ["Attr(name='skip')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.pretrained_model_name.skip"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_5"}, "paths": ["<root>.model.tokenizer.pretrained_model_name"]}, "tuple_6": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_11": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_12": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_6": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": 
[["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_6"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_11"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_12"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "ref", "key": "dir_or_string_artifact_1"}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "tokenizer_config.json", "paths": ["<root>.model.tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_6"}, "paths": ["<root>.model.tokenizer"]}, "tuple_7": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "optim"], ["Index(index=2)", "tokenizer"]], "metadata": null}, "dict_13": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_14": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_7": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "LlamaModel"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_7"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_13"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_14"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama_model_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "llama32_config1_b_1"}], ["Attr(name='optim')", {"type": "ref", "key": "megatron_optimizer_module_1"}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_7"}, "paths": ["<root>.model"]}, "tuple_8": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "grad_reduce_in_fp32"], ["Index(index=1)", "overlap_grad_reduce"], ["Index(index=2)", "overlap_param_gather"], ["Index(index=3)", "align_param_gather"], ["Index(index=4)", "use_distributed_optimizer"], ["Index(index=5)", "num_distributed_optimizer_instances"], ["Index(index=6)", "check_for_nan_in_grad"], ["Index(index=7)", "bucket_size"], ["Index(index=8)", "average_in_collective"], ["Index(index=9)", "fp8_param_gather"]], "metadata": null}, "dict_15": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_16": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "buildable_traverser_metadata_8": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.distributed.distributed_data_parallel_config", "name": "DistributedDataParallelConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_8"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_15"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_16"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "distributed_data_parallel_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='grad_reduce_in_fp32')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.grad_reduce_in_fp32"]}], ["Attr(name='overlap_grad_reduce')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_grad_reduce"]}], ["Attr(name='overlap_param_gather')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_param_gather"]}], ["Attr(name='align_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.align_param_gather"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.use_distributed_optimizer"]}], ["Attr(name='num_distributed_optimizer_instances')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.ddp.num_distributed_optimizer_instances"]}], ["Attr(name='check_for_nan_in_grad')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.check_for_nan_in_grad"]}], ["Attr(name='bucket_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.ddp.bucket_size"]}], ["Attr(name='average_in_collective')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.average_in_collective"]}], ["Attr(name='fp8_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.fp8_param_gather"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_8"}, "paths": ["<root>.trainer.strategy.ddp"]}, "tuple_9": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "gradient_as_bucket_view"]], "metadata": null}, "dict_17": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='gradient_as_bucket_view')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.kwargs['gradient_as_bucket_view']"]}]], "metadata": {"type": "ref", "key": "tuple_9"}, "paths": ["<root>.trainer.strategy.kwargs"]}, "tuple_10": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "context_parallel_size"], ["Index(index=4)", "sequence_parallel"], ["Index(index=5)", "ddp"], ["Index(index=6)", "pipeline_dtype"], ["Index(index=7)", "ckpt_async_save"], ["Index(index=8)", "ckpt_parallel_load"], ["Index(index=9)", "kwargs"]], "metadata": null}, "dict_18": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_19": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": 
"pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_9": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.strategies.megatron_strategy", "name": "MegatronStrategy"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_10"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_18"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_19"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_strategy_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.virtual_pipeline_model_parallel_size"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.context_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.sequence_parallel"]}], ["Attr(name='ddp')", {"type": "ref", "key": "distributed_data_parallel_config_1"}], ["Attr(name='pipeline_dtype')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.pipeline_dtype"]}], ["Attr(name='ckpt_async_save')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_async_save"]}], ["Attr(name='ckpt_parallel_load')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_parallel_load"]}], ["Attr(name='kwargs')", {"type": "ref", "key": "dict_17"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_9"}, "paths": ["<root>.trainer.strategy"]}, "timing_callback_1": {"type": {"type": "pyref", "module": "nemo.utils.exp_manager", "name": "TimingCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "458204eb-6487-422a-8674-610fd32a94cd", "paths": ["<root>.trainer.callbacks[0]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[0]"]}, "garbage_collection_callback_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.callbacks.garbage_collection", "name": "GarbageCollectionCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "e5e4999a-9ee0-4a4e-9cf6-c4f09d01bec2", "paths": ["<root>.trainer.callbacks[1]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[1]"]}, "list_1": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "ref", "key": "timing_callback_1"}], ["Index(index=1)", {"type": "ref", "key": "garbage_collection_callback_1"}]], "metadata": null, "paths": ["<root>.trainer.callbacks"]}, "megatron_mixed_precision_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.plugins.mixed_precision", "name": "MegatronMixedPrecision"}, "items": [["IdentityElement()", {"type": "leaf", "value": "365df6c4-8992-4ca2-aaa5-ef3123ca40ed", "paths": ["<root>.trainer.plugins"]}]], "metadata": null, "paths": ["<root>.trainer.plugins"]}, "tuple_11": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "accelerator"], 
["Index(index=1)", "strategy"], ["Index(index=2)", "devices"], ["Index(index=3)", "num_nodes"], ["Index(index=4)", "callbacks"], ["Index(index=5)", "max_steps"], ["Index(index=6)", "limit_val_batches"], ["Index(index=7)", "val_check_interval"], ["Index(index=8)", "log_every_n_steps"], ["Index(index=9)", "accumulate_grad_batches"], ["Index(index=10)", "use_distributed_sampler"], ["Index(index=11)", "plugins"]], "metadata": null}, "dict_20": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_21": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_10": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.trainer", "name": "Trainer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_11"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_20"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_21"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='accelerator')", {"type": "leaf", "value": "gpu", "paths": ["<root>.trainer.accelerator"]}], ["Attr(name='strategy')", {"type": "ref", "key": "megatron_strategy_1"}], ["Attr(name='devices')", {"type": "leaf", "value": 8, "paths": ["<root>.trainer.devices"]}], ["Attr(name='num_nodes')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.num_nodes"]}], ["Attr(name='callbacks')", {"type": "ref", "key": "list_1"}], ["Attr(name='max_steps')", {"type": "leaf", "value": 1168251, "paths": ["<root>.trainer.max_steps"]}], ["Attr(name='limit_val_batches')", {"type": "leaf", "value": 32, "paths": ["<root>.trainer.limit_val_batches"]}], ["Attr(name='val_check_interval')", {"type": "leaf", "value": 100, "paths": ["<root>.trainer.val_check_interval"]}], ["Attr(name='log_every_n_steps')", {"type": "leaf", "value": 10, "paths": ["<root>.trainer.log_every_n_steps"]}], ["Attr(name='accumulate_grad_batches')", {"type": "leaf", "value": 4, "paths": ["<root>.trainer.accumulate_grad_batches"]}], ["Attr(name='use_distributed_sampler')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.use_distributed_sampler"]}], ["Attr(name='plugins')", {"type": "ref", "key": "megatron_mixed_precision_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_10"}, "paths": ["<root>.trainer"]}, "list_2": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "leaf", "value": "Data/dclm_local_shard_1_megatron/concatenated.jsonl_text_document", "paths": ["<root>.extra['datamodule'].paths[0]"]}]], "metadata": null, "paths": ["<root>.extra['datamodule'].paths"]}, "tuple_12": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_22": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "dict_23": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_11": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_12"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_22"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_23"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.extra['datamodule'].tokenizer.pretrained_model_name"]}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "Data/tokenizer/tokenizer_config.json", "paths": ["<root>.extra['datamodule'].tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.extra['datamodule'].tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_11"}, "paths": ["<root>.extra['datamodule'].tokenizer"]}, "tuple_13": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "paths"], ["Index(index=1)", "seq_length"], ["Index(index=2)", "tokenizer"], ["Index(index=3)", "micro_batch_size"], ["Index(index=4)", "global_batch_size"], ["Index(index=5)", "split"], ["Index(index=6)", "index_mapping_dir"]], "metadata": null}, "dict_24": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_25": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_12": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.data.pre_training", "name": "PreTrainingDataModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_13"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_24"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_25"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "pre_training_data_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='paths')", {"type": "ref", "key": "list_2"}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.extra['datamodule'].seq_length"]}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_2"}], ["Attr(name='micro_batch_size')", {"type": "leaf", "value": 16, "paths": ["<root>.extra['datamodule'].micro_batch_size"]}], ["Attr(name='global_batch_size')", {"type": "leaf", "value": 512, "paths": ["<root>.extra['datamodule'].global_batch_size"]}], ["Attr(name='split')", {"type": "leaf", "value": "99,8,2", "paths": ["<root>.extra['datamodule'].split"]}], 
["Attr(name='index_mapping_dir')", {"type": "leaf", "value": "Data/index_mapping_local_shard_1", "paths": ["<root>.extra['datamodule'].index_mapping_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_12"}, "paths": ["<root>.extra['datamodule']"]}, "tuple_14": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "datamodule"]], "metadata": null}, "dict_26": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='datamodule')", {"type": "ref", "key": "pre_training_data_module_1"}]], "metadata": {"type": "ref", "key": "tuple_14"}, "paths": ["<root>.extra"]}, "tuple_15": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "model"], ["Index(index=1)", "trainer"], ["Index(index=2)", "extra"]], "metadata": null}, "dict_27": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_28": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_13": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.pl", "name": "TrainerContext"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_15"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_27"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_28"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_context_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='model')", {"type": "ref", "key": "llama_model_1"}], ["Attr(name='trainer')", {"type": "ref", "key": "trainer_1"}], ["Attr(name='extra')", {"type": "ref", "key": "dict_26"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_13"}, "paths": ["<root>"]}}, "refcounts": {"tuple_1": 1, "dict_1": 1, "dict_2": 1, "buildable_traverser_metadata_1": 1, "llama32_config1_b_1": 1, "tuple_2": 1, "dict_3": 1, "dict_4": 1, "buildable_traverser_metadata_2": 1, "optimizer_config_1": 1, "tuple_3": 1, "dict_5": 1, "dict_6": 1, "buildable_traverser_metadata_3": 1, "cosine_annealing_scheduler_1": 1, "tuple_4": 1, "dict_7": 1, "dict_8": 1, "buildable_traverser_metadata_4": 1, "megatron_optimizer_module_1": 1, "tuple_5": 1, "dict_9": 1, "dict_10": 1, "buildable_traverser_metadata_5": 1, "dir_or_string_artifact_1": 1, "tuple_6": 1, "dict_11": 1, "dict_12": 1, "buildable_traverser_metadata_6": 1, "auto_tokenizer_1": 1, "tuple_7": 1, "dict_13": 1, "dict_14": 1, "buildable_traverser_metadata_7": 1, "llama_model_1": 1, "tuple_8": 1, "dict_15": 1, "dict_16": 1, "buildable_traverser_metadata_8": 1, "distributed_data_parallel_config_1": 1, "tuple_9": 1, "dict_17": 1, "tuple_10": 1, "dict_18": 1, "dict_19": 1, "buildable_traverser_metadata_9": 1, "megatron_strategy_1": 1, "timing_callback_1": 1, "garbage_collection_callback_1": 1, "list_1": 1, "megatron_mixed_precision_1": 1, "tuple_11": 1, "dict_20": 1, "dict_21": 1, "buildable_traverser_metadata_10": 1, "trainer_1": 1, "list_2": 1, "tuple_12": 1, "dict_22": 1, "dict_23": 1, "buildable_traverser_metadata_11": 1, "auto_tokenizer_2": 1, "tuple_13": 1, 
"dict_24": 1, "dict_25": 1, "buildable_traverser_metadata_12": 1, "pre_training_data_module_1": 1, "tuple_14": 1, "dict_26": 1, "tuple_15": 1, "dict_27": 1, "dict_28": 1, "buildable_traverser_metadata_13": 1, "trainer_context_1": 1}, "version": "0.0.1"}
model_name=0--step=1099-consumed_samples=563200.0/context/model.yaml
ADDED
@@ -0,0 +1,266 @@
1 |
+
_target_: nemo.collections.llm.gpt.model.llama.LlamaModel
|
2 |
+
config:
|
3 |
+
_cpu_offloading_context: null
|
4 |
+
_target_: nemo.collections.llm.gpt.model.llama.Llama32Config1B
|
5 |
+
account_for_embedding_in_pipeline_split: false
|
6 |
+
account_for_loss_in_pipeline_split: false
|
7 |
+
activation_func:
|
8 |
+
_call_: false
|
9 |
+
_target_: torch.nn.functional.silu
|
10 |
+
activation_func_fp8_input_store: false
|
11 |
+
add_bias_linear: false
|
12 |
+
add_qkv_bias: false
|
13 |
+
apply_query_key_layer_scaling: false
|
14 |
+
apply_residual_connection_post_layernorm: false
|
15 |
+
apply_rope_fusion: true
|
16 |
+
async_tensor_model_parallel_allreduce: false
|
17 |
+
attention_backend:
|
18 |
+
_call_: true
|
19 |
+
_target_: megatron.core.transformer.enums.AttnBackend
|
20 |
+
attention_dropout: 0.0
|
21 |
+
attention_softmax_in_fp32: false
|
22 |
+
autocast_dtype:
|
23 |
+
_call_: false
|
24 |
+
_target_: torch.bfloat16
|
25 |
+
barrier_with_L1_time: true
|
26 |
+
batch_p2p_comm: true
|
27 |
+
batch_p2p_sync: true
|
28 |
+
bf16: true
|
29 |
+
bias_activation_fusion: true
|
30 |
+
bias_dropout_fusion: true
|
31 |
+
calculate_per_token_loss: false
|
32 |
+
clone_scatter_output_in_embedding: true
|
33 |
+
config_logger_dir: ''
|
34 |
+
context_parallel_size: 1
|
35 |
+
cp_comm_type: null
|
36 |
+
cpu_offloading: false
|
37 |
+
cpu_offloading_activations: true
|
38 |
+
cpu_offloading_num_layers: 0
|
39 |
+
cpu_offloading_weights: true
|
40 |
+
cross_entropy_loss_fusion: true
|
41 |
+
cuda_graph_retain_backward_graph: false
|
42 |
+
cuda_graph_use_single_mempool: false
|
43 |
+
cuda_graph_warmup_steps: 3
|
44 |
+
data_step_fn:
|
45 |
+
_call_: false
|
46 |
+
_target_: nemo.collections.llm.gpt.model.base.gpt_data_step
|
47 |
+
deallocate_pipeline_outputs: true
|
48 |
+
defer_embedding_wgrad_compute: false
|
49 |
+
deterministic_mode: false
|
50 |
+
disable_parameter_transpose_cache: false
|
51 |
+
distribute_saved_activations: null
|
52 |
+
enable_autocast: false
|
53 |
+
enable_cuda_graph: false
|
54 |
+
expert_model_parallel_size: 1
|
55 |
+
expert_tensor_parallel_size: null
|
56 |
+
external_cuda_graph: false
|
57 |
+
ffn_hidden_size: 8192
|
58 |
+
finalize_model_grads_func: null
|
59 |
+
flash_decode: false
|
60 |
+
forward_step_fn:
|
61 |
+
_call_: false
|
62 |
+
_target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
|
63 |
+
fp16: false
|
64 |
+
fp16_lm_cross_entropy: false
|
65 |
+
fp32_residual_connection: false
|
66 |
+
fp8: null
|
67 |
+
fp8_amax_compute_algo: most_recent
|
68 |
+
fp8_amax_history_len: 1
|
69 |
+
fp8_dot_product_attention: false
|
70 |
+
fp8_interval: 1
|
71 |
+
fp8_margin: 0
|
72 |
+
fp8_multi_head_attention: false
|
73 |
+
fp8_wgrad: true
|
74 |
+
gated_linear_unit: true
|
75 |
+
grad_scale_func: null
|
76 |
+
grad_sync_func: null
|
77 |
+
gradient_accumulation_fusion: true
|
78 |
+
hidden_dropout: 0.0
|
79 |
+
hidden_size: 2048
|
80 |
+
hierarchical_context_parallel_sizes: null
|
81 |
+
high_freq_factor: 4
|
82 |
+
inference_rng_tracker: false
|
83 |
+
init_method: null
|
84 |
+
init_method_std: 0.02
|
85 |
+
kv_channels: null
|
86 |
+
layernorm_epsilon: 1.0e-05
|
87 |
+
layernorm_zero_centered_gamma: false
|
88 |
+
low_freq_factor: 1
|
89 |
+
make_vocab_size_divisible_by: 128
|
90 |
+
masked_softmax_fusion: true
|
91 |
+
memory_efficient_layer_norm: false
|
92 |
+
microbatch_group_size_per_vp_stage: 1
|
93 |
+
moe_aux_loss_coeff: 0
|
94 |
+
moe_expert_capacity_factor: null
|
95 |
+
moe_extended_tp: false
|
96 |
+
moe_ffn_hidden_size: null
|
97 |
+
moe_grouped_gemm: false
|
98 |
+
moe_input_jitter_eps: null
|
99 |
+
moe_layer_freq: 1
|
100 |
+
moe_layer_recompute: false
|
101 |
+
moe_pad_expert_input_to_capacity: false
|
102 |
+
moe_per_layer_logging: false
|
103 |
+
moe_permute_fusion: false
|
104 |
+
moe_router_bias_update_rate: 0.001
|
105 |
+
moe_router_enable_expert_bias: false
|
106 |
+
moe_router_group_topk: null
|
107 |
+
moe_router_load_balancing_type: aux_loss
|
108 |
+
moe_router_num_groups: null
|
109 |
+
moe_router_pre_softmax: false
|
110 |
+
moe_router_score_function: softmax
|
111 |
+
moe_router_topk: 2
|
112 |
+
moe_router_topk_limited_devices: null
|
113 |
+
moe_router_topk_scaling_factor: null
|
114 |
+
moe_shared_expert_intermediate_size: null
|
115 |
+
moe_shared_expert_overlap: false
|
116 |
+
moe_token_dispatcher_type: allgather
|
117 |
+
moe_token_drop_policy: probs
|
118 |
+
moe_token_dropping: false
|
119 |
+
moe_use_legacy_grouped_gemm: false
|
120 |
+
moe_z_loss_coeff: null
|
121 |
+
multi_latent_attention: false
|
122 |
+
no_sync_func: null
|
123 |
+
normalization: RMSNorm
|
124 |
+
num_attention_heads: 32
|
125 |
+
num_layers: 16
|
126 |
+
num_layers_in_first_pipeline_stage: null
|
127 |
+
num_layers_in_last_pipeline_stage: null
|
128 |
+
num_microbatches_with_partial_activation_checkpoints: null
|
129 |
+
num_moe_experts: null
|
130 |
+
num_query_groups: 8
|
131 |
+
old_context_len: 8192
|
132 |
+
output_layer_init_method: null
|
133 |
+
overlap_p2p_comm: false
|
134 |
+
overlap_p2p_comm_warmup_flush: false
|
135 |
+
parallel_output: true
|
136 |
+
param_sync_func: null
|
137 |
+
params_dtype:
|
138 |
+
_call_: false
|
139 |
+
_target_: torch.bfloat16
|
140 |
+
perform_initialization: true
|
141 |
+
persist_layer_norm: true
|
142 |
+
pipeline_dtype:
|
143 |
+
_call_: false
|
144 |
+
_target_: torch.bfloat16
|
145 |
+
pipeline_model_parallel_size: 1
|
146 |
+
pipeline_model_parallel_split_rank: null
|
147 |
+
position_embedding_type: rope
|
148 |
+
qk_layernorm: false
|
149 |
+
recompute_granularity: null
|
150 |
+
recompute_method: null
|
151 |
+
recompute_num_layers: null
|
152 |
+
rotary_base: 500000
|
153 |
+
rotary_interleaved: false
|
154 |
+
rotary_percent: 1.0
|
155 |
+
scale_factor: 32
|
156 |
+
scatter_embedding_sequence_parallel: true
|
157 |
+
seq_len_interpolation_factor: null
|
158 |
+
seq_length: 2048
|
159 |
+
sequence_parallel: false
|
160 |
+
share_embeddings_and_output_weights: true
|
161 |
+
softmax_scale: null
|
162 |
+
tensor_model_parallel_size: 1
|
163 |
+
test_mode: false
|
164 |
+
timers: null
|
165 |
+
tp_comm_atomic_ag: false
|
166 |
+
tp_comm_atomic_rs: false
|
167 |
+
tp_comm_bootstrap_backend: nccl
|
168 |
+
tp_comm_bulk_dgrad: true
|
169 |
+
tp_comm_bulk_wgrad: true
|
170 |
+
tp_comm_overlap: false
|
171 |
+
tp_comm_overlap_ag: true
|
172 |
+
tp_comm_overlap_disable_fc1: false
|
173 |
+
tp_comm_overlap_disable_qkv: false
|
174 |
+
tp_comm_overlap_rs: true
|
175 |
+
tp_comm_overlap_rs_dgrad: false
|
176 |
+
tp_comm_split_ag: true
|
177 |
+
tp_comm_split_rs: true
|
178 |
+
tp_only_amax_red: false
|
179 |
+
transformer_layer_spec:
|
180 |
+
_call_: false
|
181 |
+
_target_: nemo.collections.llm.gpt.model.base.default_layer_spec
|
182 |
+
use_cpu_initialization: false
|
183 |
+
use_ring_exchange_p2p: false
|
184 |
+
use_te_rng_tracker: false
|
185 |
+
use_transformer_engine_full_layer_spec: false
|
186 |
+
variable_seq_lengths: false
|
187 |
+
virtual_pipeline_model_parallel_size: null
|
188 |
+
wgrad_deferral_limit: 0
|
189 |
+
window_size: null
|
190 |
+
model_transform: null
|
191 |
+
optim:
|
192 |
+
_target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
|
193 |
+
config:
|
194 |
+
_target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
|
195 |
+
adam_beta1: 0.9
|
196 |
+
adam_beta2: 0.95
|
197 |
+
adam_eps: 1.0e-05
|
198 |
+
barrier_with_L1_time: false
|
199 |
+
bf16: true
|
200 |
+
clip_grad: 1.0
|
201 |
+
config_logger_dir: ''
|
202 |
+
decoupled_lr: null
|
203 |
+
decoupled_min_lr: null
|
204 |
+
exp_avg_dtype:
|
205 |
+
_call_: false
|
206 |
+
_target_: torch.float32
|
207 |
+
exp_avg_sq_dtype:
|
208 |
+
_call_: false
|
209 |
+
_target_: torch.float32
|
210 |
+
fp16: false
|
211 |
+
hysteresis: 2
|
212 |
+
initial_loss_scale: 4294967296
|
213 |
+
log_num_zeros_in_grad: false
|
214 |
+
loss_scale: null
|
215 |
+
loss_scale_window: 1000
|
216 |
+
lr: 0.0003
|
217 |
+
main_grads_dtype:
|
218 |
+
_call_: false
|
219 |
+
_target_: torch.float32
|
220 |
+
main_params_dtype:
|
221 |
+
_call_: false
|
222 |
+
_target_: torch.float32
|
223 |
+
min_loss_scale: 1.0
|
224 |
+
min_lr: null
|
225 |
+
optimizer: adam
|
226 |
+
overlap_param_gather_with_optimizer_step: false
|
227 |
+
params_dtype:
|
228 |
+
_call_: false
|
229 |
+
_target_: torch.float32
|
230 |
+
sgd_momentum: 0.9
|
231 |
+
timers: null
|
232 |
+
use_distributed_optimizer: true
|
233 |
+
use_precision_aware_optimizer: false
|
234 |
+
weight_decay: 0.1
|
235 |
+
lr_mult: 1.0
|
236 |
+
lr_scheduler:
|
237 |
+
_target_: nemo.lightning.pytorch.optim.lr_scheduler.CosineAnnealingScheduler
|
238 |
+
constant_steps: 0
|
239 |
+
frequency: 1
|
240 |
+
interval: step
|
241 |
+
max_steps: 10
|
242 |
+
min_lr: 2.9999999999999997e-05
|
243 |
+
monitor: val_loss
|
244 |
+
warmup_steps: 2000
|
245 |
+
no_weight_decay_cond: null
|
246 |
+
scale_lr_cond: null
|
247 |
+
tokenizer:
|
248 |
+
_target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
|
249 |
+
additional_special_tokens: []
|
250 |
+
bos_token: null
|
251 |
+
cls_token: null
|
252 |
+
eos_token: null
|
253 |
+
include_special_tokens: false
|
254 |
+
mask_token: null
|
255 |
+
merges_file: null
|
256 |
+
pad_token: null
|
257 |
+
pretrained_model_name:
|
258 |
+
_target_: nemo.lightning.io.artifact.file.DirOrStringArtifact
|
259 |
+
attr: allenai/OLMo-1B-hf
|
260 |
+
required: true
|
261 |
+
skip: true
|
262 |
+
sep_token: null
|
263 |
+
trust_remote_code: false
|
264 |
+
unk_token: null
|
265 |
+
use_fast: true
|
266 |
+
vocab_file: tokenizer_config.json
|
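
A config like the model.yaml above is a serialized object graph: every mapping with a `_target_` key names a Python class or function by dotted path, and `_call_: false` means the attribute itself is the value (for example `torch.bfloat16`) rather than the result of calling it. NeMo's own loader (`nemo.lightning.io`) does this reconstruction with much more machinery; the snippet below is only a minimal sketch of the idea, assuming PyYAML is installed and that the referenced packages (torch, and NeMo/Megatron for the deeper targets) are importable. The file path and the `resolve` helper are illustrative, not part of the checkpoint.

```python
import importlib
import yaml

def resolve(node):
    """Simplified stand-in for NeMo's restore logic (illustrative only).

    {'_target_': 'pkg.mod.Name', '_call_': False} -> the imported attribute itself
    {'_target_': 'pkg.mod.Name', ...}             -> Name(**remaining keys)
    """
    if isinstance(node, dict) and "_target_" in node:
        module_path, _, attr = node["_target_"].rpartition(".")
        target = getattr(importlib.import_module(module_path), attr)
        if node.get("_call_", True) is False:
            return target
        kwargs = {k: resolve(v) for k, v in node.items() if not k.startswith("_")}
        return target(**kwargs)
    if isinstance(node, dict):
        return {k: resolve(v) for k, v in node.items()}
    if isinstance(node, list):
        return [resolve(v) for v in node]
    return node

with open("model_name=0--step=1099-consumed_samples=563200.0/context/model.yaml") as f:
    cfg = yaml.safe_load(f)

# A safe example: params_dtype resolves to the torch.bfloat16 dtype object.
print(resolve(cfg["config"]["params_dtype"]))
```

Resolving the full `config`/`optim`/`tokenizer` tree the same way would additionally require the NeMo and Megatron-Core modules named in the `_target_` paths to be installed.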
model_name=0--step=1099-consumed_samples=563200.0/context/tokenizer_config.json
ADDED
@@ -0,0 +1,238 @@
+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}
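
The tokenizer config above is a standard Hugging Face `tokenizer_config.json` in GPT-NeoX style (as used by allenai/OLMo-1B-hf), so it can be inspected with nothing but the standard library. A small sketch, assuming the checkpoint path below:

```python
import json

path = "model_name=0--step=1099-consumed_samples=563200.0/context/tokenizer_config.json"
with open(path) as f:
    cfg = json.load(f)

# Separate the special added tokens (e.g. <|padding|>, <|endoftext|>) from the ordinary ones.
special = {i: t["content"] for i, t in cfg["added_tokens_decoder"].items() if t["special"]}
print("eos:", cfg["eos_token"], "| pad:", cfg["pad_token"])
print("special added tokens:", special)
```

Loading the tokenizer itself would go through `transformers.AutoTokenizer.from_pretrained`, which typically also expects the vocabulary (e.g. a `tokenizer.json`) alongside this config file.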
model_name=0--step=1099-consumed_samples=563200.0/weights/.metadata
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:facd341a3e9283f9a4f7fe31f8e95313741b788a6377ca9a81f7102b042f7154
+size 272080
model_name=0--step=1099-consumed_samples=563200.0/weights/__0_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9861184d7da94d0b8acc516c3892b3205a4ceb2ac52ba2903468c93552fafa86
+size 938897288
model_name=0--step=1099-consumed_samples=563200.0/weights/__0_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7414d97f91efa62d89626cf7b725721aa87e47b13ecea0347a55b81dc64e32ea
+size 940460984
model_name=0--step=1099-consumed_samples=563200.0/weights/__1_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f75bdd68e507974b4dab6870b1882b4deaf1123fd0474996e9a6cc65be7a51f
+size 938851248
model_name=0--step=1099-consumed_samples=563200.0/weights/__1_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:777f0d65325e91dff2f33de550fb21b28a230e1c77d851df6d96ef552538d2e2
+size 943962344
model_name=0--step=1099-consumed_samples=563200.0/weights/__2_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d6f5f453d1072064a1407fdba699100eaf2c286172a1b2ad1e3217c42000814
+size 937781988
model_name=0--step=1099-consumed_samples=563200.0/weights/__2_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:444ca8a8af0d375ecef8a3a470f1a26ff0371731dee22f95a87b5da8b704adc2
+size 944982044
model_name=0--step=1099-consumed_samples=563200.0/weights/__3_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d60be0f7adf398e02e66b4c9e02f1a8e256d1bbafac0cda34b000b06701fe92
+size 943054924
model_name=0--step=1099-consumed_samples=563200.0/weights/__3_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cc34615658513009c2d179e64e6133774acb0722e1c3e8469068b918224a5d1
+size 943954152
model_name=0--step=1099-consumed_samples=563200.0/weights/__4_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35953e1d75fb6016408144f244a87d98766ebc87ae96528dc3882933354b5a4c
+size 941969992
model_name=0--step=1099-consumed_samples=563200.0/weights/__4_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a1fc983a7b9fd845438ecdf174e03c2f24fadb64b3ede68b5dec91848af0025
+size 944985984
model_name=0--step=1099-consumed_samples=563200.0/weights/__5_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1088efade9e2fc9e2af8c05d832fd254be1b858d33da03727d32a8e810b492c
+size 941995240
model_name=0--step=1099-consumed_samples=563200.0/weights/__5_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:297084c3892884b1511e0e27db58bd45e73d8282fe52bc4dceee02ae1cb4d257
+size 945017376
model_name=0--step=1099-consumed_samples=563200.0/weights/__6_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cc3912061a5526b8951b0001a03bfcdae356915a5d35238571a670fb28f8040
+size 941969992
model_name=0--step=1099-consumed_samples=563200.0/weights/__6_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcaec0a208bec64828d1ae8e930af484009c11a032f2f5a7d97d060334e1f143
+size 936770304
model_name=0--step=1099-consumed_samples=563200.0/weights/__7_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd5a5a74c2e63a5fd4ded355a51e2484d9dd0bbd04a21b65cf2e1783551664c7
+size 938826468
model_name=0--step=1099-consumed_samples=563200.0/weights/__7_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0528102c26e46362071c25bccc1319b319b4f8171973c4c8f752a0d8429500e
+size 943936384
model_name=0--step=1099-consumed_samples=563200.0/weights/common.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b48888cd6697727c63c214a3d3bc7382484a10625143e009f0b4b4ef222e4c1
+size 9314
model_name=0--step=1099-consumed_samples=563200.0/weights/metadata.json
ADDED
@@ -0,0 +1 @@
+{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
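
Every file under `weights/` above is stored as a Git LFS pointer (a three-line stub with `version`, `oid`, and `size`), and `metadata.json` records that the shards use the `torch_dist` sharded-checkpoint backend. A small sketch that sums the shard sizes from those pointer stubs, assuming the repository is checked out without `git lfs pull` so the stubs are still plain text:

```python
from pathlib import Path

weights_dir = Path("model_name=0--step=1099-consumed_samples=563200.0/weights")

def pointer_size(path: Path) -> int:
    """Read the 'size <bytes>' line of a Git LFS pointer stub."""
    for line in path.read_text().splitlines():
        if line.startswith("size "):
            return int(line.split()[1])
    raise ValueError(f"{path} is not an LFS pointer")

shards = sorted(weights_dir.glob("__*.distcp"))
total = sum(pointer_size(p) for p in shards)
print(f"{total / 1e9:.2f} GB across {len(shards)} distcp shards")
```

For this checkpoint the sixteen `__*_*.distcp` shards come to roughly 15 GB, which is consistent with a 1B-parameter model saved together with its distributed optimizer state.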
model_name=0--step=1199-consumed_samples=614400.0/context/7f55e7bc-67d0-43b6-a099-17d6faf84264
ADDED
Binary file (584 Bytes).
model_name=0--step=1199-consumed_samples=614400.0/context/bcb2d43c-8276-4afb-b1da-8ef068679e7c
ADDED
Binary file (173 Bytes).
model_name=0--step=1199-consumed_samples=614400.0/context/d0be63b8-65ec-4740-b706-58027d280788
ADDED
Binary file (202 Bytes).
model_name=0--step=1199-consumed_samples=614400.0/context/io.json
ADDED
@@ -0,0 +1 @@
{"root": {"type": "ref", "key": "trainer_context_1"}, "objects": {"tuple_1": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "sequence_parallel"], ["Index(index=4)", "context_parallel_size"], ["Index(index=5)", "expert_model_parallel_size"], ["Index(index=6)", "expert_tensor_parallel_size"], ["Index(index=7)", "moe_extended_tp"], ["Index(index=8)", "bf16"], ["Index(index=9)", "params_dtype"], ["Index(index=10)", "autocast_dtype"], ["Index(index=11)", "use_te_rng_tracker"], ["Index(index=12)", "pipeline_dtype"], ["Index(index=13)", "microbatch_group_size_per_vp_stage"], ["Index(index=14)", "account_for_embedding_in_pipeline_split"], ["Index(index=15)", "account_for_loss_in_pipeline_split"], ["Index(index=16)", "share_embeddings_and_output_weights"], ["Index(index=17)", "seq_length"]], "metadata": null}, "dict_1": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_2": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "Llama32Config1B"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_1"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_1"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_2"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama32_config1_b_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.virtual_pipeline_model_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.sequence_parallel"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.context_parallel_size"]}], ["Attr(name='expert_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.expert_model_parallel_size"]}], ["Attr(name='expert_tensor_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.model.config.expert_tensor_parallel_size"]}], ["Attr(name='moe_extended_tp')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.moe_extended_tp"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='autocast_dtype')", {"type": "pyref", "module": 
"torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='use_te_rng_tracker')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.use_te_rng_tracker"]}], ["Attr(name='pipeline_dtype')", {"type": "pyref", "module": "torch", "name": "bfloat16", "paths": ["<root>.model.config.params_dtype", "<root>.model.config.autocast_dtype", "<root>.model.config.pipeline_dtype"]}], ["Attr(name='microbatch_group_size_per_vp_stage')", {"type": "leaf", "value": 1, "paths": ["<root>.model.config.microbatch_group_size_per_vp_stage"]}], ["Attr(name='account_for_embedding_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_embedding_in_pipeline_split"]}], ["Attr(name='account_for_loss_in_pipeline_split')", {"type": "leaf", "value": false, "paths": ["<root>.model.config.account_for_loss_in_pipeline_split"]}], ["Attr(name='share_embeddings_and_output_weights')", {"type": "leaf", "value": true, "paths": ["<root>.model.config.share_embeddings_and_output_weights"]}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.model.config.seq_length"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_1"}, "paths": ["<root>.model.config"]}, "tuple_2": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "optimizer"], ["Index(index=1)", "lr"], ["Index(index=2)", "min_lr"], ["Index(index=3)", "decoupled_lr"], ["Index(index=4)", "decoupled_min_lr"], ["Index(index=5)", "weight_decay"], ["Index(index=6)", "fp16"], ["Index(index=7)", "bf16"], ["Index(index=8)", "params_dtype"], ["Index(index=9)", "use_precision_aware_optimizer"], ["Index(index=10)", "main_grads_dtype"], ["Index(index=11)", "main_params_dtype"], ["Index(index=12)", "exp_avg_dtype"], ["Index(index=13)", "exp_avg_sq_dtype"], ["Index(index=14)", "loss_scale"], ["Index(index=15)", "initial_loss_scale"], ["Index(index=16)", "min_loss_scale"], ["Index(index=17)", "loss_scale_window"], ["Index(index=18)", "hysteresis"], ["Index(index=19)", "adam_beta1"], ["Index(index=20)", "adam_beta2"], ["Index(index=21)", "adam_eps"], ["Index(index=22)", "sgd_momentum"], ["Index(index=23)", "use_distributed_optimizer"], ["Index(index=24)", "overlap_param_gather_with_optimizer_step"], ["Index(index=25)", "clip_grad"], ["Index(index=26)", "log_num_zeros_in_grad"], ["Index(index=27)", "barrier_with_L1_time"], ["Index(index=28)", "timers"], ["Index(index=29)", "config_logger_dir"]], "metadata": null}, "dict_3": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_4": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.optimizer.optimizer_config", "name": "OptimizerConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_2"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_3"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_4"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": 
"BuildableTraverserMetadata"}}, "optimizer_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='optimizer')", {"type": "leaf", "value": "adam", "paths": ["<root>.model.optim.config.optimizer"]}], ["Attr(name='lr')", {"type": "leaf", "value": 0.0003, "paths": ["<root>.model.optim.config.lr"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.min_lr"]}], ["Attr(name='decoupled_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_lr"]}], ["Attr(name='decoupled_min_lr')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.decoupled_min_lr"]}], ["Attr(name='weight_decay')", {"type": "leaf", "value": 0.1, "paths": ["<root>.model.optim.config.weight_decay"]}], ["Attr(name='fp16')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.fp16"]}], ["Attr(name='bf16')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.bf16"]}], ["Attr(name='params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='use_precision_aware_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.use_precision_aware_optimizer"]}], ["Attr(name='main_grads_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='main_params_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='exp_avg_sq_dtype')", {"type": "pyref", "module": "torch", "name": "float32", "paths": ["<root>.model.optim.config.params_dtype", "<root>.model.optim.config.main_grads_dtype", "<root>.model.optim.config.main_params_dtype", "<root>.model.optim.config.exp_avg_dtype", "<root>.model.optim.config.exp_avg_sq_dtype"]}], ["Attr(name='loss_scale')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.loss_scale"]}], ["Attr(name='initial_loss_scale')", {"type": "leaf", "value": 4294967296, "paths": ["<root>.model.optim.config.initial_loss_scale"]}], ["Attr(name='min_loss_scale')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.min_loss_scale"]}], ["Attr(name='loss_scale_window')", {"type": "leaf", "value": 1000, "paths": ["<root>.model.optim.config.loss_scale_window"]}], ["Attr(name='hysteresis')", {"type": "leaf", "value": 2, "paths": ["<root>.model.optim.config.hysteresis"]}], ["Attr(name='adam_beta1')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.adam_beta1"]}], ["Attr(name='adam_beta2')", {"type": 
"leaf", "value": 0.95, "paths": ["<root>.model.optim.config.adam_beta2"]}], ["Attr(name='adam_eps')", {"type": "leaf", "value": 1e-05, "paths": ["<root>.model.optim.config.adam_eps"]}], ["Attr(name='sgd_momentum')", {"type": "leaf", "value": 0.9, "paths": ["<root>.model.optim.config.sgd_momentum"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": true, "paths": ["<root>.model.optim.config.use_distributed_optimizer"]}], ["Attr(name='overlap_param_gather_with_optimizer_step')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.overlap_param_gather_with_optimizer_step"]}], ["Attr(name='clip_grad')", {"type": "leaf", "value": 1.0, "paths": ["<root>.model.optim.config.clip_grad"]}], ["Attr(name='log_num_zeros_in_grad')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.log_num_zeros_in_grad"]}], ["Attr(name='barrier_with_L1_time')", {"type": "leaf", "value": false, "paths": ["<root>.model.optim.config.barrier_with_L1_time"]}], ["Attr(name='timers')", {"type": "leaf", "value": null, "paths": ["<root>.model.optim.config.timers"]}], ["Attr(name='config_logger_dir')", {"type": "leaf", "value": "", "paths": ["<root>.model.optim.config.config_logger_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_2"}, "paths": ["<root>.model.optim.config"]}, "tuple_3": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "warmup_steps"], ["Index(index=1)", "constant_steps"], ["Index(index=2)", "min_lr"]], "metadata": null}, "dict_5": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_6": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_3": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.lr_scheduler", "name": "CosineAnnealingScheduler"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_3"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_5"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_6"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "cosine_annealing_scheduler_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='warmup_steps')", {"type": "leaf", "value": 2000, "paths": ["<root>.model.optim.lr_scheduler.warmup_steps"]}], ["Attr(name='constant_steps')", {"type": "leaf", "value": 0, "paths": ["<root>.model.optim.lr_scheduler.constant_steps"]}], ["Attr(name='min_lr')", {"type": "leaf", "value": 2.9999999999999997e-05, "paths": ["<root>.model.optim.lr_scheduler.min_lr"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_3"}, "paths": ["<root>.model.optim.lr_scheduler"]}, "tuple_4": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "lr_scheduler"]], "metadata": null}, "dict_7": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_8": {"type": 
{"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_4": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.optim.megatron", "name": "MegatronOptimizerModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_4"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_7"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_8"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_optimizer_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "optimizer_config_1"}], ["Attr(name='lr_scheduler')", {"type": "ref", "key": "cosine_annealing_scheduler_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_4"}, "paths": ["<root>.model.optim"]}, "tuple_5": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "attr"], ["Index(index=1)", "skip"]], "metadata": null}, "dict_9": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_10": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_5": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.artifact.file", "name": "DirOrStringArtifact"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_5"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_9"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_10"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "dir_or_string_artifact_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='attr')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.model.tokenizer.pretrained_model_name.attr"]}], ["Attr(name='skip')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.pretrained_model_name.skip"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_5"}, "paths": ["<root>.model.tokenizer.pretrained_model_name"]}, "tuple_6": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_11": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_12": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_6": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": 
[["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_6"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_11"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_12"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "ref", "key": "dir_or_string_artifact_1"}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "tokenizer_config.json", "paths": ["<root>.model.tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.model.tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_6"}, "paths": ["<root>.model.tokenizer"]}, "tuple_7": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "config"], ["Index(index=1)", "optim"], ["Index(index=2)", "tokenizer"]], "metadata": null}, "dict_13": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_14": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_7": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.model.llama", "name": "LlamaModel"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_7"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_13"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_14"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "llama_model_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='config')", {"type": "ref", "key": "llama32_config1_b_1"}], ["Attr(name='optim')", {"type": "ref", "key": "megatron_optimizer_module_1"}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_7"}, "paths": ["<root>.model"]}, "tuple_8": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "grad_reduce_in_fp32"], ["Index(index=1)", "overlap_grad_reduce"], ["Index(index=2)", "overlap_param_gather"], ["Index(index=3)", "align_param_gather"], ["Index(index=4)", "use_distributed_optimizer"], ["Index(index=5)", "num_distributed_optimizer_instances"], ["Index(index=6)", "check_for_nan_in_grad"], ["Index(index=7)", "bucket_size"], ["Index(index=8)", "average_in_collective"], ["Index(index=9)", "fp8_param_gather"]], "metadata": null}, "dict_15": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_16": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "buildable_traverser_metadata_8": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "megatron.core.distributed.distributed_data_parallel_config", "name": "DistributedDataParallelConfig"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_8"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_15"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_16"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "distributed_data_parallel_config_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='grad_reduce_in_fp32')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.grad_reduce_in_fp32"]}], ["Attr(name='overlap_grad_reduce')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_grad_reduce"]}], ["Attr(name='overlap_param_gather')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.overlap_param_gather"]}], ["Attr(name='align_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.align_param_gather"]}], ["Attr(name='use_distributed_optimizer')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.use_distributed_optimizer"]}], ["Attr(name='num_distributed_optimizer_instances')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.ddp.num_distributed_optimizer_instances"]}], ["Attr(name='check_for_nan_in_grad')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.check_for_nan_in_grad"]}], ["Attr(name='bucket_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.ddp.bucket_size"]}], ["Attr(name='average_in_collective')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ddp.average_in_collective"]}], ["Attr(name='fp8_param_gather')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.ddp.fp8_param_gather"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_8"}, "paths": ["<root>.trainer.strategy.ddp"]}, "tuple_9": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "gradient_as_bucket_view"]], "metadata": null}, "dict_17": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='gradient_as_bucket_view')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.kwargs['gradient_as_bucket_view']"]}]], "metadata": {"type": "ref", "key": "tuple_9"}, "paths": ["<root>.trainer.strategy.kwargs"]}, "tuple_10": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "tensor_model_parallel_size"], ["Index(index=1)", "pipeline_model_parallel_size"], ["Index(index=2)", "virtual_pipeline_model_parallel_size"], ["Index(index=3)", "context_parallel_size"], ["Index(index=4)", "sequence_parallel"], ["Index(index=5)", "ddp"], ["Index(index=6)", "pipeline_dtype"], ["Index(index=7)", "ckpt_async_save"], ["Index(index=8)", "ckpt_parallel_load"], ["Index(index=9)", "kwargs"]], "metadata": null}, "dict_18": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_19": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": 
"pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_9": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.strategies.megatron_strategy", "name": "MegatronStrategy"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_10"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_18"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_19"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "megatron_strategy_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='tensor_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.tensor_model_parallel_size"]}], ["Attr(name='pipeline_model_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.pipeline_model_parallel_size"]}], ["Attr(name='virtual_pipeline_model_parallel_size')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.virtual_pipeline_model_parallel_size"]}], ["Attr(name='context_parallel_size')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.strategy.context_parallel_size"]}], ["Attr(name='sequence_parallel')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.strategy.sequence_parallel"]}], ["Attr(name='ddp')", {"type": "ref", "key": "distributed_data_parallel_config_1"}], ["Attr(name='pipeline_dtype')", {"type": "leaf", "value": null, "paths": ["<root>.trainer.strategy.pipeline_dtype"]}], ["Attr(name='ckpt_async_save')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_async_save"]}], ["Attr(name='ckpt_parallel_load')", {"type": "leaf", "value": true, "paths": ["<root>.trainer.strategy.ckpt_parallel_load"]}], ["Attr(name='kwargs')", {"type": "ref", "key": "dict_17"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_9"}, "paths": ["<root>.trainer.strategy"]}, "timing_callback_1": {"type": {"type": "pyref", "module": "nemo.utils.exp_manager", "name": "TimingCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "d0be63b8-65ec-4740-b706-58027d280788", "paths": ["<root>.trainer.callbacks[0]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[0]"]}, "garbage_collection_callback_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.callbacks.garbage_collection", "name": "GarbageCollectionCallback"}, "items": [["IdentityElement()", {"type": "leaf", "value": "bcb2d43c-8276-4afb-b1da-8ef068679e7c", "paths": ["<root>.trainer.callbacks[1]"]}]], "metadata": null, "paths": ["<root>.trainer.callbacks[1]"]}, "list_1": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "ref", "key": "timing_callback_1"}], ["Index(index=1)", {"type": "ref", "key": "garbage_collection_callback_1"}]], "metadata": null, "paths": ["<root>.trainer.callbacks"]}, "megatron_mixed_precision_1": {"type": {"type": "pyref", "module": "nemo.lightning.pytorch.plugins.mixed_precision", "name": "MegatronMixedPrecision"}, "items": [["IdentityElement()", {"type": "leaf", "value": "7f55e7bc-67d0-43b6-a099-17d6faf84264", "paths": ["<root>.trainer.plugins"]}]], "metadata": null, "paths": ["<root>.trainer.plugins"]}, "tuple_11": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "accelerator"], 
["Index(index=1)", "strategy"], ["Index(index=2)", "devices"], ["Index(index=3)", "num_nodes"], ["Index(index=4)", "callbacks"], ["Index(index=5)", "max_steps"], ["Index(index=6)", "limit_val_batches"], ["Index(index=7)", "val_check_interval"], ["Index(index=8)", "log_every_n_steps"], ["Index(index=9)", "accumulate_grad_batches"], ["Index(index=10)", "use_distributed_sampler"], ["Index(index=11)", "plugins"]], "metadata": null}, "dict_20": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_21": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_10": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.pytorch.trainer", "name": "Trainer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_11"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_20"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_21"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='accelerator')", {"type": "leaf", "value": "gpu", "paths": ["<root>.trainer.accelerator"]}], ["Attr(name='strategy')", {"type": "ref", "key": "megatron_strategy_1"}], ["Attr(name='devices')", {"type": "leaf", "value": 8, "paths": ["<root>.trainer.devices"]}], ["Attr(name='num_nodes')", {"type": "leaf", "value": 1, "paths": ["<root>.trainer.num_nodes"]}], ["Attr(name='callbacks')", {"type": "ref", "key": "list_1"}], ["Attr(name='max_steps')", {"type": "leaf", "value": 1168251, "paths": ["<root>.trainer.max_steps"]}], ["Attr(name='limit_val_batches')", {"type": "leaf", "value": 32, "paths": ["<root>.trainer.limit_val_batches"]}], ["Attr(name='val_check_interval')", {"type": "leaf", "value": 100, "paths": ["<root>.trainer.val_check_interval"]}], ["Attr(name='log_every_n_steps')", {"type": "leaf", "value": 10, "paths": ["<root>.trainer.log_every_n_steps"]}], ["Attr(name='accumulate_grad_batches')", {"type": "leaf", "value": 4, "paths": ["<root>.trainer.accumulate_grad_batches"]}], ["Attr(name='use_distributed_sampler')", {"type": "leaf", "value": false, "paths": ["<root>.trainer.use_distributed_sampler"]}], ["Attr(name='plugins')", {"type": "ref", "key": "megatron_mixed_precision_1"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_10"}, "paths": ["<root>.trainer"]}, "list_2": {"type": {"type": "pyref", "module": "builtins", "name": "list"}, "items": [["Index(index=0)", {"type": "leaf", "value": "Data/dclm_local_shard_1_megatron/concatenated.jsonl_text_document", "paths": ["<root>.extra['datamodule'].paths[0]"]}]], "metadata": null, "paths": ["<root>.extra['datamodule'].paths"]}, "tuple_12": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "pretrained_model_name"], ["Index(index=1)", "vocab_file"], ["Index(index=2)", "use_fast"]], "metadata": null}, "dict_22": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": 
null}}, "dict_23": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_11": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.common.tokenizers.huggingface.auto_tokenizer", "name": "AutoTokenizer"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_12"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_22"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_23"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "auto_tokenizer_2": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='pretrained_model_name')", {"type": "leaf", "value": "allenai/OLMo-1B-hf", "paths": ["<root>.extra['datamodule'].tokenizer.pretrained_model_name"]}], ["Attr(name='vocab_file')", {"type": "leaf", "value": "Data/tokenizer/tokenizer_config.json", "paths": ["<root>.extra['datamodule'].tokenizer.vocab_file"]}], ["Attr(name='use_fast')", {"type": "leaf", "value": true, "paths": ["<root>.extra['datamodule'].tokenizer.use_fast"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_11"}, "paths": ["<root>.extra['datamodule'].tokenizer"]}, "tuple_13": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "paths"], ["Index(index=1)", "seq_length"], ["Index(index=2)", "tokenizer"], ["Index(index=3)", "micro_batch_size"], ["Index(index=4)", "global_batch_size"], ["Index(index=5)", "split"], ["Index(index=6)", "index_mapping_dir"]], "metadata": null}, "dict_24": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_25": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_12": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.collections.llm.gpt.data.pre_training", "name": "PreTrainingDataModule"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_13"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_24"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_25"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "pre_training_data_module_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='paths')", {"type": "ref", "key": "list_2"}], ["Attr(name='seq_length')", {"type": "leaf", "value": 2048, "paths": ["<root>.extra['datamodule'].seq_length"]}], ["Attr(name='tokenizer')", {"type": "ref", "key": "auto_tokenizer_2"}], ["Attr(name='micro_batch_size')", {"type": "leaf", "value": 16, "paths": ["<root>.extra['datamodule'].micro_batch_size"]}], ["Attr(name='global_batch_size')", {"type": "leaf", "value": 512, "paths": ["<root>.extra['datamodule'].global_batch_size"]}], ["Attr(name='split')", {"type": "leaf", "value": "99,8,2", "paths": ["<root>.extra['datamodule'].split"]}], 
["Attr(name='index_mapping_dir')", {"type": "leaf", "value": "Data/index_mapping_local_shard_1", "paths": ["<root>.extra['datamodule'].index_mapping_dir"]}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_12"}, "paths": ["<root>.extra['datamodule']"]}, "tuple_14": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "datamodule"]], "metadata": null}, "dict_26": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [["Key(key='datamodule')", {"type": "ref", "key": "pre_training_data_module_1"}]], "metadata": {"type": "ref", "key": "tuple_14"}, "paths": ["<root>.extra"]}, "tuple_15": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [["Index(index=0)", "model"], ["Index(index=1)", "trainer"], ["Index(index=2)", "extra"]], "metadata": null}, "dict_27": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "dict_28": {"type": {"type": "pyref", "module": "builtins", "name": "dict"}, "items": [], "metadata": {"type": {"type": "pyref", "module": "builtins", "name": "tuple"}, "items": [], "metadata": null}}, "buildable_traverser_metadata_13": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}, "items": [["Attr(name='fn_or_cls')", {"type": "pyref", "module": "nemo.lightning.io.pl", "name": "TrainerContext"}], ["Attr(name='argument_names')", {"type": "ref", "key": "tuple_15"}], ["Attr(name='argument_tags')", {"type": "ref", "key": "dict_27"}], ["Attr(name='argument_history')", {"type": "ref", "key": "dict_28"}]], "metadata": {"type": "pyref", "module": "fiddle._src.config", "name": "BuildableTraverserMetadata"}}, "trainer_context_1": {"type": {"type": "pyref", "module": "fiddle._src.config", "name": "Config"}, "items": [["Attr(name='model')", {"type": "ref", "key": "llama_model_1"}], ["Attr(name='trainer')", {"type": "ref", "key": "trainer_1"}], ["Attr(name='extra')", {"type": "ref", "key": "dict_26"}]], "metadata": {"type": "ref", "key": "buildable_traverser_metadata_13"}, "paths": ["<root>"]}}, "refcounts": {"tuple_1": 1, "dict_1": 1, "dict_2": 1, "buildable_traverser_metadata_1": 1, "llama32_config1_b_1": 1, "tuple_2": 1, "dict_3": 1, "dict_4": 1, "buildable_traverser_metadata_2": 1, "optimizer_config_1": 1, "tuple_3": 1, "dict_5": 1, "dict_6": 1, "buildable_traverser_metadata_3": 1, "cosine_annealing_scheduler_1": 1, "tuple_4": 1, "dict_7": 1, "dict_8": 1, "buildable_traverser_metadata_4": 1, "megatron_optimizer_module_1": 1, "tuple_5": 1, "dict_9": 1, "dict_10": 1, "buildable_traverser_metadata_5": 1, "dir_or_string_artifact_1": 1, "tuple_6": 1, "dict_11": 1, "dict_12": 1, "buildable_traverser_metadata_6": 1, "auto_tokenizer_1": 1, "tuple_7": 1, "dict_13": 1, "dict_14": 1, "buildable_traverser_metadata_7": 1, "llama_model_1": 1, "tuple_8": 1, "dict_15": 1, "dict_16": 1, "buildable_traverser_metadata_8": 1, "distributed_data_parallel_config_1": 1, "tuple_9": 1, "dict_17": 1, "tuple_10": 1, "dict_18": 1, "dict_19": 1, "buildable_traverser_metadata_9": 1, "megatron_strategy_1": 1, "timing_callback_1": 1, "garbage_collection_callback_1": 1, "list_1": 1, "megatron_mixed_precision_1": 1, "tuple_11": 1, "dict_20": 1, "dict_21": 1, "buildable_traverser_metadata_10": 1, "trainer_1": 1, "list_2": 1, "tuple_12": 1, "dict_22": 1, "dict_23": 1, "buildable_traverser_metadata_11": 1, "auto_tokenizer_2": 1, "tuple_13": 1, 
"dict_24": 1, "dict_25": 1, "buildable_traverser_metadata_12": 1, "pre_training_data_module_1": 1, "tuple_14": 1, "dict_26": 1, "tuple_15": 1, "dict_27": 1, "dict_28": 1, "buildable_traverser_metadata_13": 1, "trainer_context_1": 1}, "version": "0.0.1"}
model_name=0--step=1199-consumed_samples=614400.0/context/model.yaml
ADDED
@@ -0,0 +1,266 @@
+_target_: nemo.collections.llm.gpt.model.llama.LlamaModel
+config:
+  _cpu_offloading_context: null
+  _target_: nemo.collections.llm.gpt.model.llama.Llama32Config1B
+  account_for_embedding_in_pipeline_split: false
+  account_for_loss_in_pipeline_split: false
+  activation_func:
+    _call_: false
+    _target_: torch.nn.functional.silu
+  activation_func_fp8_input_store: false
+  add_bias_linear: false
+  add_qkv_bias: false
+  apply_query_key_layer_scaling: false
+  apply_residual_connection_post_layernorm: false
+  apply_rope_fusion: true
+  async_tensor_model_parallel_allreduce: false
+  attention_backend:
+    _call_: true
+    _target_: megatron.core.transformer.enums.AttnBackend
+  attention_dropout: 0.0
+  attention_softmax_in_fp32: false
+  autocast_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  barrier_with_L1_time: true
+  batch_p2p_comm: true
+  batch_p2p_sync: true
+  bf16: true
+  bias_activation_fusion: true
+  bias_dropout_fusion: true
+  calculate_per_token_loss: false
+  clone_scatter_output_in_embedding: true
+  config_logger_dir: ''
+  context_parallel_size: 1
+  cp_comm_type: null
+  cpu_offloading: false
+  cpu_offloading_activations: true
+  cpu_offloading_num_layers: 0
+  cpu_offloading_weights: true
+  cross_entropy_loss_fusion: true
+  cuda_graph_retain_backward_graph: false
+  cuda_graph_use_single_mempool: false
+  cuda_graph_warmup_steps: 3
+  data_step_fn:
+    _call_: false
+    _target_: nemo.collections.llm.gpt.model.base.gpt_data_step
+  deallocate_pipeline_outputs: true
+  defer_embedding_wgrad_compute: false
+  deterministic_mode: false
+  disable_parameter_transpose_cache: false
+  distribute_saved_activations: null
+  enable_autocast: false
+  enable_cuda_graph: false
+  expert_model_parallel_size: 1
+  expert_tensor_parallel_size: null
+  external_cuda_graph: false
+  ffn_hidden_size: 8192
+  finalize_model_grads_func: null
+  flash_decode: false
+  forward_step_fn:
+    _call_: false
+    _target_: nemo.collections.llm.gpt.model.base.gpt_forward_step
+  fp16: false
+  fp16_lm_cross_entropy: false
+  fp32_residual_connection: false
+  fp8: null
+  fp8_amax_compute_algo: most_recent
+  fp8_amax_history_len: 1
+  fp8_dot_product_attention: false
+  fp8_interval: 1
+  fp8_margin: 0
+  fp8_multi_head_attention: false
+  fp8_wgrad: true
+  gated_linear_unit: true
+  grad_scale_func: null
+  grad_sync_func: null
+  gradient_accumulation_fusion: true
+  hidden_dropout: 0.0
+  hidden_size: 2048
+  hierarchical_context_parallel_sizes: null
+  high_freq_factor: 4
+  inference_rng_tracker: false
+  init_method: null
+  init_method_std: 0.02
+  kv_channels: null
+  layernorm_epsilon: 1.0e-05
+  layernorm_zero_centered_gamma: false
+  low_freq_factor: 1
+  make_vocab_size_divisible_by: 128
+  masked_softmax_fusion: true
+  memory_efficient_layer_norm: false
+  microbatch_group_size_per_vp_stage: 1
+  moe_aux_loss_coeff: 0
+  moe_expert_capacity_factor: null
+  moe_extended_tp: false
+  moe_ffn_hidden_size: null
+  moe_grouped_gemm: false
+  moe_input_jitter_eps: null
+  moe_layer_freq: 1
+  moe_layer_recompute: false
+  moe_pad_expert_input_to_capacity: false
+  moe_per_layer_logging: false
+  moe_permute_fusion: false
+  moe_router_bias_update_rate: 0.001
+  moe_router_enable_expert_bias: false
+  moe_router_group_topk: null
+  moe_router_load_balancing_type: aux_loss
+  moe_router_num_groups: null
+  moe_router_pre_softmax: false
+  moe_router_score_function: softmax
+  moe_router_topk: 2
+  moe_router_topk_limited_devices: null
+  moe_router_topk_scaling_factor: null
+  moe_shared_expert_intermediate_size: null
+  moe_shared_expert_overlap: false
+  moe_token_dispatcher_type: allgather
+  moe_token_drop_policy: probs
+  moe_token_dropping: false
+  moe_use_legacy_grouped_gemm: false
+  moe_z_loss_coeff: null
+  multi_latent_attention: false
+  no_sync_func: null
+  normalization: RMSNorm
+  num_attention_heads: 32
+  num_layers: 16
+  num_layers_in_first_pipeline_stage: null
+  num_layers_in_last_pipeline_stage: null
+  num_microbatches_with_partial_activation_checkpoints: null
+  num_moe_experts: null
+  num_query_groups: 8
+  old_context_len: 8192
+  output_layer_init_method: null
+  overlap_p2p_comm: false
+  overlap_p2p_comm_warmup_flush: false
+  parallel_output: true
+  param_sync_func: null
+  params_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  perform_initialization: true
+  persist_layer_norm: true
+  pipeline_dtype:
+    _call_: false
+    _target_: torch.bfloat16
+  pipeline_model_parallel_size: 1
+  pipeline_model_parallel_split_rank: null
+  position_embedding_type: rope
+  qk_layernorm: false
+  recompute_granularity: null
+  recompute_method: null
+  recompute_num_layers: null
+  rotary_base: 500000
+  rotary_interleaved: false
+  rotary_percent: 1.0
+  scale_factor: 32
+  scatter_embedding_sequence_parallel: true
+  seq_len_interpolation_factor: null
+  seq_length: 2048
+  sequence_parallel: false
+  share_embeddings_and_output_weights: true
+  softmax_scale: null
+  tensor_model_parallel_size: 1
+  test_mode: false
+  timers: null
+  tp_comm_atomic_ag: false
+  tp_comm_atomic_rs: false
+  tp_comm_bootstrap_backend: nccl
+  tp_comm_bulk_dgrad: true
+  tp_comm_bulk_wgrad: true
+  tp_comm_overlap: false
+  tp_comm_overlap_ag: true
+  tp_comm_overlap_disable_fc1: false
+  tp_comm_overlap_disable_qkv: false
+  tp_comm_overlap_rs: true
+  tp_comm_overlap_rs_dgrad: false
+  tp_comm_split_ag: true
+  tp_comm_split_rs: true
+  tp_only_amax_red: false
+  transformer_layer_spec:
+    _call_: false
+    _target_: nemo.collections.llm.gpt.model.base.default_layer_spec
+  use_cpu_initialization: false
+  use_ring_exchange_p2p: false
+  use_te_rng_tracker: false
+  use_transformer_engine_full_layer_spec: false
+  variable_seq_lengths: false
+  virtual_pipeline_model_parallel_size: null
+  wgrad_deferral_limit: 0
+  window_size: null
+model_transform: null
+optim:
+  _target_: nemo.lightning.pytorch.optim.megatron.MegatronOptimizerModule
+  config:
+    _target_: megatron.core.optimizer.optimizer_config.OptimizerConfig
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    adam_eps: 1.0e-05
+    barrier_with_L1_time: false
+    bf16: true
+    clip_grad: 1.0
+    config_logger_dir: ''
+    decoupled_lr: null
+    decoupled_min_lr: null
+    exp_avg_dtype:
+      _call_: false
+      _target_: torch.float32
+    exp_avg_sq_dtype:
+      _call_: false
+      _target_: torch.float32
+    fp16: false
+    hysteresis: 2
+    initial_loss_scale: 4294967296
+    log_num_zeros_in_grad: false
+    loss_scale: null
+    loss_scale_window: 1000
+    lr: 0.0003
+    main_grads_dtype:
+      _call_: false
+      _target_: torch.float32
+    main_params_dtype:
+      _call_: false
+      _target_: torch.float32
+    min_loss_scale: 1.0
+    min_lr: null
+    optimizer: adam
+    overlap_param_gather_with_optimizer_step: false
+    params_dtype:
+      _call_: false
+      _target_: torch.float32
+    sgd_momentum: 0.9
+    timers: null
+    use_distributed_optimizer: true
+    use_precision_aware_optimizer: false
+    weight_decay: 0.1
+  lr_mult: 1.0
+  lr_scheduler:
+    _target_: nemo.lightning.pytorch.optim.lr_scheduler.CosineAnnealingScheduler
+    constant_steps: 0
+    frequency: 1
+    interval: step
+    max_steps: 10
+    min_lr: 2.9999999999999997e-05
+    monitor: val_loss
+    warmup_steps: 2000
+  no_weight_decay_cond: null
+  scale_lr_cond: null
+tokenizer:
+  _target_: nemo.collections.common.tokenizers.huggingface.auto_tokenizer.AutoTokenizer
+  additional_special_tokens: []
+  bos_token: null
+  cls_token: null
+  eos_token: null
+  include_special_tokens: false
+  mask_token: null
+  merges_file: null
+  pad_token: null
+  pretrained_model_name:
+    _target_: nemo.lightning.io.artifact.file.DirOrStringArtifact
+    attr: allenai/OLMo-1B-hf
+    required: true
+    skip: true
+  sep_token: null
+  trust_remote_code: false
+  unk_token: null
+  use_fast: true
+  vocab_file: tokenizer_config.json
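The model.yaml above mirrors the same configuration in _target_/_call_ form: _target_ names a dotted import path, and _call_: false means the named object (a function or dtype such as torch.bfloat16) is referenced rather than instantiated. Restoring the full context goes through NeMo's own nemo.lightning.io machinery; the snippet below is only an illustrative sketch of reading the raw YAML and resolving one _target_ string with importlib, with a placeholder file path and an ad-hoc resolve() helper that is not part of NeMo.

    import importlib
    import yaml  # PyYAML

    def resolve(dotted):
        # Import a dotted path such as 'torch.nn.functional.silu' and return the attribute.
        module, _, attr = dotted.rpartition(".")
        return getattr(importlib.import_module(module), attr)

    with open("context/model.yaml") as f:  # placeholder path to one checkpoint's context
        cfg = yaml.safe_load(f)

    print(cfg["config"]["hidden_size"], cfg["config"]["num_layers"])              # 2048 16
    print(cfg["optim"]["config"]["lr"], cfg["optim"]["config"]["weight_decay"])   # 0.0003 0.1
    act = resolve(cfg["config"]["activation_func"]["_target_"])                   # torch.nn.functional.silu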
model_name=0--step=1199-consumed_samples=614400.0/context/tokenizer_config.json
ADDED
@@ -0,0 +1,238 @@
+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "|||IP_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": " ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "|||EMAIL_ADDRESS|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "|||PHONE_NUMBER|||",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": null,
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<|padding|>",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": null
+}
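This tokenizer_config.json is a standard Hugging Face (GPT-NeoX style) tokenizer configuration matching the allenai/OLMo-1B-hf tokenizer referenced in model.yaml: <|endoftext|> is the EOS token, <|padding|> the pad token, and ids 0, 50277 and 50278 are the |||IP_ADDRESS|||, |||EMAIL_ADDRESS||| and |||PHONE_NUMBER||| masking tokens. A minimal usage sketch, assuming the transformers library is installed and loading by the hub name recorded in the config:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")
    print(tok.eos_token)  # '<|endoftext|>' per the config above
    print(tok.pad_token)  # '<|padding|>' per the config above

    ids = tok("Hello world")["input_ids"]
    print(ids, tok.decode(ids))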
model_name=0--step=1199-consumed_samples=614400.0/weights/.metadata
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cca533b8f016500a29ba2eedeb52952409aa5774a1a9e518e0edf8754b6a775
+size 272080
model_name=0--step=1199-consumed_samples=614400.0/weights/__0_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:318daa35aaa3daaa750b2038d9c8c9cc83f8bd9a7fea19cb3a2d7169eb496fbe
+size 938897288
model_name=0--step=1199-consumed_samples=614400.0/weights/__0_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5fa8c5be0aa36993539e9532ca1af4978e749e22845768f522e1d6600b4558a
+size 940460984
model_name=0--step=1199-consumed_samples=614400.0/weights/__1_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6c8a42986905c8421074f1d86572e5b9f41b7968d3a6ec7cfabe9e2bd9d7e00
+size 938851248
model_name=0--step=1199-consumed_samples=614400.0/weights/__2_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5b6823462b60d91972586e6e3ab7fb6deb405d42095995872a1f9bc40fa27f3
+size 937781988
model_name=0--step=1199-consumed_samples=614400.0/weights/__2_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c733c88dee30045a9d10ce7976046dd4f685b8279c4f175da01e9e3cee2eb8f3
+size 944982044
model_name=0--step=1199-consumed_samples=614400.0/weights/__3_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c9145de689fedb8124f87c98762311f2c165da876fea681964b51e5fd4a9148
+size 943954152
model_name=0--step=1199-consumed_samples=614400.0/weights/__4_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fab3877d26bca3a3ea0e6ba8cfe3ed41af2674b3b020bfdb9dad17c2823a3c2
+size 941969992
model_name=0--step=1199-consumed_samples=614400.0/weights/__5_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f386507be000e0799bb6197de7bed8d962ce73aac81e85b0964ebc0922fadd4f
+size 941995240
model_name=0--step=1199-consumed_samples=614400.0/weights/__5_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e8831951bb9aca35793d72d258c7e8a7f565b7c6022bc04ca3fd03324fb12b4
+size 945017376
model_name=0--step=1199-consumed_samples=614400.0/weights/__6_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e275d2beddace7b1019ea5e91772b897700a49776f956cd6bade85dfb4962d9
+size 941969992
model_name=0--step=1199-consumed_samples=614400.0/weights/__6_1.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:315ca31f48492e945ecef18428c40a7ef158fa3f626cfa07915957df43354f4e
+size 936770304
model_name=0--step=1199-consumed_samples=614400.0/weights/__7_0.distcp
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:870a9a4fbe3989e157ba79a1aa527631852c97599c462a4f4c2581213d4f1f57
+size 938826468
model_name=0--step=1199-consumed_samples=614400.0/weights/common.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5dd0140650cc8daefcdf2767288c7e6d71d829e35ff91d3782ab712fd65d1e5
+size 9822
model_name=0--step=1199-consumed_samples=614400.0/weights/metadata.json
ADDED
@@ -0,0 +1 @@
+{"sharded_backend": "torch_dist", "sharded_backend_version": 1, "common_backend": "torch", "common_backend_version": 1}
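Each entry under weights/ is stored through Git LFS, so the diff only shows the pointer stub (a version line, an oid sha256 hash, and the payload size in bytes); the actual torch_dist-sharded tensors live behind those oids, and metadata.json records the sharded and common checkpoint backends. The sketch below assumes the repository was cloned without running git lfs pull, so the .distcp files on disk are still pointer stubs, and simply tallies the advertised shard sizes; the directory path is taken from the listing above.

    import json
    from pathlib import Path

    weights_dir = Path("model_name=0--step=1199-consumed_samples=614400.0/weights")

    def parse_lfs_pointer(path):
        # A Git LFS pointer stub is three 'key value' lines: version, oid, size.
        fields = dict(line.split(" ", 1) for line in path.read_text().splitlines() if line)
        return {"oid": fields["oid"], "size": int(fields["size"])}

    meta = json.loads((weights_dir / "metadata.json").read_text())
    print(meta["sharded_backend"], meta["common_backend"])  # torch_dist torch

    shards = sorted(weights_dir.glob("*.distcp"))
    total = sum(parse_lfs_pointer(p)["size"] for p in shards)
    print(f"{len(shards)} shards, {total / 1e9:.2f} GB advertised")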
model_name=0--step=1299-consumed_samples=665600.0-last-unfinished
ADDED
File without changes
model_name=0--step=1299-consumed_samples=665600.0-unfinished
ADDED
File without changes
model_name=0--step=1299-consumed_samples=665600.0/context/54c4eb36-2ea5-4b54-a535-213b8bd850a2
ADDED
Binary file (173 Bytes)