---
# Run configuration for a training job with periodic evaluation.
#
# NOTE(review): the copy this file was restored from had lost all indentation
# and carried file-viewer residue (a size line, "72d5ec2" - presumably a commit
# id - and a line-number gutter). The nesting below was reconstructed from the
# alphabetical key order of the dump; verify it against the consumer's schema.
# All scalar values are unchanged.

# Checkpoints are written locally under /scratch; the resume location is the
# same S3 URI as s3_upload.upload_s3_path.
checkpoints:
  checkpoint_interval: 500
  checkpoints_path: /scratch/craffel/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredwikiteam-seed-6-
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredwikiteam-seed-6-
  save_initial_state: true

data:
  dataset:
    dataloader_type: single
    dataset_max_tokens: null
    dataset_weights: null
    datasets:
      - bits_per_token: 16
        filename_pattern: .*\.ds$
        folder: /scratch/dataset/commav0p1-ablations-1p82G-commonpile0p1filteredwikiteam-seed-6-/
        original_folder: null
        seed: 6
        shuffle: true
        skip_tokens: 0
    pad_samples_to_global_batch_size: false
    skip_in_stream: true
  num_loading_workers: 0
  # Data seed (6) is distinct from general.seed (42).
  seed: 6

experiment_logger:
  tensorboard_logger:
    push_to_hub_interval: 300
    repo_id: craffel/commav0p1-ablations
    repo_public: false
    tensorboard_dir: /scratch/craffel/tensorboard-craffel-commav0p1-ablations
  wandb_logger: null

general:
  benchmark_csv_path: null
  # 14336000 = step (14000) x 1024, and 1024 = parallelism.dp (64)
  # x tokens.micro_batch_size (4) x tokens.batch_accumulation_per_replica (4)
  # - presumably the global batch size in sequences; TODO confirm.
  consumed_train_samples: 14336000
  ignore_sanity_checks: true
  project: commav0p1-ablations
  run: commav0p1-ablations-1p82G-commonpile0p1filteredwikiteam-seed-6-
  seed: 42
  step: 14000

kill_switch_path: null

lighteval:
  batch_size: 16
  checkpoints_path: null
  generation: null
  logging:
    hub_repo_details: null
    hub_repo_results: null
    hub_repo_tensorboard: craffel/commav0p1-ablations
    local_output_path: /scratch/craffel/lighteval/commav0p1-ablations-1p82G-commonpile0p1filteredwikiteam-seed-6-
    push_details_to_hub: false
    push_results_to_hub: false
    push_results_to_tensorboard: true
    tensorboard_metric_prefix: e
  # Evaluation has its own parallelism layout (dp: 8), separate from the
  # top-level `parallelism` section (dp: 64).
  parallelism:
    dp: 8
    expert_parallel_size: 1
    pp: 1
    pp_engine: 1f1b
    tp: 1
    tp_linear_async_communication: false
    tp_mode: ALL_REDUCE
  slurm_script_dir: /fsx/craffel/train/eval-scripts
  slurm_template: /fsx/craffel/run_eval.slurm.jinja
  tasks:
    custom_tasks: brrr.lighteval.evaluation_tasks
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: early-signal
  wandb: null

logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info

model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.02
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 32
    num_hidden_layers: 24
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 50272

optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_starting_step: null
    lr_decay_steps: null
    lr_decay_style: cosine
    lr_warmup_steps: 500
    lr_warmup_style: linear
    min_decay_lr: 3.0e-05
  torch_adam_is_fused: true
  weight_decay: 0.1
  zero_stage: 0

parallelism:
  dp: 64
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER

profiler: null

s3_upload:
  remove_after_upload: true
  s5cmd_concurrency: 5
  s5cmd_numworkers: 16
  s5cmd_path: /fsx/craffel/miniconda3/envs/exp/bin/s5cmd
  upload_s3_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredwikiteam-seed-6-

tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null

tokens:
  batch_accumulation_per_replica: 4
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 4
  sequence_length: 2048
  train_steps: 14305
  val_check_interval: 100