Transformers
English
Mixture of Experts
olmo
flexolmo
Flex-public-7B-1T / config.json
swj0419's picture
Initial model upload
f45d541 verified
{"run_name": "OLMo2-7B-anneal-from-stage1-no-math", "launch": {"name": "OLMo2-7B-anneal-from-stage1-no-math-train-497f9f22", "cmd": ["src/scripts/train/OLMo2-7B-linear-decay.py", "train", "OLMo2-7B-anneal-from-stage1-no-math", "/weka/oe-training-default/ai2-llm/checkpoints/akshitab/OLMo2-7B-stage1-step928646", "ai2/jupiter-cirrascale-2", "--launch.num_nodes=8", "--launch.workspace=OLMo-modular", "--launch.beaker_image=petew/olmo-core-tch260cu124", "--launch.priority=urgent", "--trainer.callbacks.wandb.enabled=True", "--trainer.callbacks.comet.enabled=False", "--trainer.callbacks.lm_evaluator.enabled=False", "--trainer.callbacks.downstream_evaluator.enabled=True", "--trainer.max_duration.value=50000000000", "--trainer.max_duration.unit=tokens", "--dataset.mix=dolmino_minus_math", "--dataset.mix_base_dir=/weka/oe-training-default/ai2-llm/", "--train_module.float8_config.enabled=true", "--train_module.optim.lr=0.000061499", "--train_module.scheduler.warmup_steps=0", "--train_module.scheduler.alpha_f=0"], "budget": "ai2/oe-training", "task_name": "train", "workspace": "OLMo-modular", "setup_steps": ["conda install gh --channel conda-forge", "gh repo clone \"$REPO_URL\" .", "git checkout \"$GIT_REF\"", "git submodule update --init --recursive", "conda shell.bash activate base", "pip install -e '.[dev,beaker,wandb,train]'", "pip freeze", "mkdir -p ~/.aws", "printenv AWS_CONFIG > ~/.aws/config", "printenv AWS_CREDENTIALS > ~/.aws/credentials"], "beaker_image": "petew/olmo-core-tch260cu124", "num_nodes": 8, "num_gpus": 8, "shared_memory": "10GiB", "clusters": ["ai2/jupiter-cirrascale-2"], "shared_filesystem": true, "priority": "urgent", "preemptible": true, "env_vars": [{"name": "NCCL_DEBUG", "value": "WARN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}, {"name": "CUDA_LAUNCH_BLOCKING", "value": "0", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvVar"}], "env_secrets": [{"name": "GITHUB_TOKEN", "secret": "weijias_GITHUB_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "BEAKER_TOKEN", "secret": "weijias_BEAKER_TOKEN", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WANDB_API_KEY", "secret": "weijias_WANDB_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "COMET_API_KEY", "secret": "weijias_COMET_API_KEY", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CONFIG", "secret": "weijias_AWS_CONFIG", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "AWS_CREDENTIALS", "secret": "weijias_AWS_CREDENTIALS", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "R2_ENDPOINT_URL", "secret": "R2_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "WEKA_ENDPOINT_URL", "secret": "WEKA_ENDPOINT_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}, {"name": "SLACK_WEBHOOK_URL", "secret": "SLACK_WEBHOOK_URL", "_CLASS_": "olmo_core.launch.beaker.BeakerEnvSecret"}], "nfs": false, "weka_buckets": [{"bucket": "oe-training-default", "mount": "/weka/oe-training-default", "_CLASS_": "olmo_core.launch.beaker.BeakerWekaBucket"}], "allow_dirty": false, "_CLASS_": "olmo_core.launch.beaker.BeakerLaunchConfig"}, "model": {"d_model": 4096, "vocab_size": 100352, "n_layers": 32, "block": {"attention": {"name": "default", "n_heads": 32, "bias": false, "rope": {"name": "default", "theta": 500000, "full_precision": true, "_CLASS_": "olmo_core.nn.rope.RoPEConfig"}, "qk_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "use_flash": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.attention.AttentionConfig"}, "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "feed_forward": {"hidden_size": 11008, "name": "default", "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"}, "name": "reordered_norm", "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"}, "lm_head": {"name": "default", "layer_norm": {"name": "rms", "eps": 1e-06, "bias": false, "dtype": "float32", "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"}, "bias": false, "dtype": "float32", "loss_implementation": "default", "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"}, "name": "default", "dtype": "float32", "init_method": "normal", "init_seed": 0, "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"}, "dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "fsl", "sequence_length": 4096, "max_target_sequence_length": 8192, "mix": "dolmino_minus_math", "mix_base_dir": "/weka/oe-training-default/ai2-llm/", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": "/weka/oe-training-default/ai2-llm/checkpoints/weijias/OLMo2-7B-anneal-from-stage1-no-math/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "data_loader": {"global_batch_size": 4194304, "seed": 34521, "num_workers": 4, "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"}, "train_module": {"rank_microbatch_size": 8192, "max_sequence_length": 4096, "optim": {"group_overrides": [{"params": ["embeddings.weight"], "opts": {"weight_decay": 0.0}, "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"}], "compile": false, "fixed_fields": ["initial_lr"], "lr": 6.1499e-05, "betas": [0.9, 0.95], "eps": 1e-08, "weight_decay": 0.1, "fused": true, "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"}, "max_grad_norm": 1.0, "scheduler": {"lr_field": "lr", "initial_lr_field": "initial_lr", "alpha_f": 0.0, "warmup_steps": 0, "warmup_min_lr": 0.0, "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"}, "compile_model": true, "float8_config": {"scaling_type_input": "dynamic", "scaling_type_weight": "dynamic", "scaling_type_grad_output": "dynamic", "enable_fsdp_float8_all_gather": true, "precompute_float8_dynamic_scale_for_fsdp": true, "force_recompute_fp8_weight_in_bwd": true, "compile": true, "enabled": true, "_CLASS_": "olmo_core.float8.Float8Config"}, "dp_config": {"name": "hsdp", "param_dtype": "bfloat16", "reduce_dtype": "float32", "wrapping_strategy": "blocks", "prefetch_factor": 0, "_CLASS_": "olmo_core.train.train_module.transformer.TransformerDataParallelConfig"}, "z_loss_multiplier": 1e-05, "state_dict_save_opts": {"flatten_optimizer_state_dict": true}, "label_ignore_index": -100, "_CLASS_": "olmo_core.train.train_module.transformer.TransformerTrainModuleConfig"}, "trainer": {"save_folder": "/weka/oe-training-default/ai2-llm/checkpoints/weijias/OLMo2-7B-anneal-from-stage1-no-math", "load_strategy": "if_available", "checkpointer": {"pre_download": false, "throttle_uploads": false, "_CLASS_": "olmo_core.train.checkpoint.CheckpointerConfig"}, "save_overwrite": true, "max_duration": {"value": 50000000000, "unit": "tokens", "_CLASS_": "olmo_core.train.common.Duration"}, "cancel_check_interval": 1, "metrics_collect_interval": 10, "callbacks": {"downstream_evaluator": {"tasks": ["piqa", "hellaswag", "winogrande", "openbook_qa", "boolq", "sciq", "xsum", "wildbench_math", "wildbench_reasoning", "wildbench_coding_debugging", "wildbench_creative_writing", "mmlu_stem_val_rc_5shot", "mmlu_humanities_val_rc_5shot", "mmlu_social_sciences_val_rc_5shot", "mmlu_other_val_rc_5shot"], "tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": true, "_CLASS_": "olmo_modular.eval.evaluator_callback.DownstreamEvaluatorUpdatedCallbackConfig"}, "checkpointer": {"save_interval": 10000, "ephemeral_save_interval": 250, "save_async": true, "remove": "ephemeral_only", "enabled": true, "_CLASS_": "olmo_core.train.callbacks.checkpointer.CheckpointerCallback"}, "comet": {"enabled": false, "name": "OLMo2-7B-anneal-from-stage1-no-math", "project": "OLMo-modular", "workspace": "ai2", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "notifications": "none", "failure_tag": "failed", "_CLASS_": "olmo_core.train.callbacks.comet.CometCallback"}, "wandb": {"enabled": true, "name": "OLMo2-7B-anneal-from-stage1-no-math", "project": "OLMo-modular", "entity": "ai2-llm", "cancel_tags": ["cancel", "canceled", "cancelled"], "cancel_check_interval": 10, "_CLASS_": "olmo_core.train.callbacks.wandb.WandBCallback"}, "config_saver": {"fname": "config.json", "_CLASS_": "olmo_core.train.callbacks.config_saver.ConfigSaverCallback"}, "profiler": {"skip_first": 0, "wait": 1, "warmup": 5, "active": 3, "repeat": 1, "enabled": false, "_CLASS_": "olmo_core.train.callbacks.profiler.ProfilerCallback"}, "garbage_collector": {"gc_interval": 1000, "enabled": true, "_CLASS_": "olmo_core.train.callbacks.garbage_collector.GarbageCollectorCallback"}, "slack_notifier": {"name": "OLMo2-7B-anneal-from-stage1-no-math", "notifications": "end_only", "enabled": false, "_CLASS_": "olmo_core.train.callbacks.slack_notifier.SlackNotifierCallback"}, "beaker": {"enabled": true, "_CLASS_": "olmo_core.train.callbacks.beaker.BeakerCallback"}, "gpu_monitor": {"_CLASS_": "olmo_core.train.callbacks.gpu_memory_monitor.GPUMemoryMonitorCallback"}, "lm_evaluator": {"eval_dataset": {"tokenizer": {"vocab_size": 100278, "eos_token_id": 100257, "pad_token_id": 100277, "identifier": "allenai/dolma2-tokenizer", "_CLASS_": "olmo_core.data.tokenizer.TokenizerConfig"}, "name": "padded_fsl", "sequence_length": 4096, "mix": "v3-small-ppl-validation", "mix_base_dir": "/weka/oe-training-default/ai2-llm", "include_instance_metadata": true, "generate_doc_lengths": false, "expand_glob": false, "work_dir": "/weka/oe-training-default/ai2-llm/checkpoints/weijias/dataset-cache", "_CLASS_": "olmo_core.data.numpy_dataset.NumpyDatasetConfig"}, "eval_interval": 1000, "eval_duration": {"value": 1, "unit": "epochs", "_CLASS_": "olmo_core.train.common.Duration"}, "log_interval": 5, "enabled": false, "_CLASS_": "olmo_core.train.callbacks.evaluator_callback.LMEvaluatorCallbackConfig"}}, "no_checkpoints": false, "no_evals": false, "_CLASS_": "olmo_core.train.config.TrainerConfig"}, "init_seed": 12536, "_CLASS_": "olmo_modular.internal.experiment.ExperimentConfig"}