# File size: 4,032 Bytes
# 72d5ec2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Checkpoint save/resume settings for this run.
checkpoints:
  # Save cadence (presumably counted in training steps — confirm against the trainer).
  checkpoint_interval: 500
  # Local scratch directory; its last path component equals general.run.
  checkpoints_path: /scratch/craffel/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredstackv2-seed-6-
  checkpoints_path_is_shared_file_system: false
  # Resume source; this is the same URI as s3_upload.upload_s3_path in this file.
  resume_checkpoint_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredstackv2-seed-6-
  save_initial_state: true
# Training data source: a single dataset entry read from local scratch.
data:
  dataset:
    dataloader_type: single
    dataset_max_tokens: null
    # No explicit weights; only one dataset is listed below.
    dataset_weights: null
    datasets:
    # NOTE(review): 16 bits can index up to 65536 ids, which covers
    # model.model_config.vocab_size (50272) — presumably the on-disk token width; confirm.
    - bits_per_token: 16
      # Regex selecting files whose names end in ".ds".
      filename_pattern: .*\.ds$
      folder: /scratch/dataset/commav0p1-ablations-1p82G-commonpile0p1filteredstackv2-seed-6-/
      original_folder: null
      # Same value as data.seed below and the "seed-6" suffix in the run name.
      seed: 6
      shuffle: true
      skip_tokens: 0
    pad_samples_to_global_batch_size: false
    skip_in_stream: true
  num_loading_workers: 0
  # NOTE(review): data seed is 6 while general.seed is 42 — confirm this split is intentional.
  seed: 6
# Experiment logging: TensorBoard is configured, Weights & Biases is disabled (null).
experiment_logger:
  tensorboard_logger:
    # Push cadence to the hub repo (unit not shown here — TODO confirm steps vs. seconds).
    push_to_hub_interval: 300
    # Same repo as lighteval.logging.hub_repo_tensorboard.
    repo_id: craffel/commav0p1-ablations
    repo_public: false
    tensorboard_dir: /scratch/craffel/tensorboard-craffel-commav0p1-ablations
  wandb_logger: null
# Run identity and progress counters.
general:
  benchmark_csv_path: null
  # 14336000 = 14000 * 1024, i.e. step * (parallelism.dp 64 * tokens.micro_batch_size 4
  # * tokens.batch_accumulation_per_replica 4) — presumably step * global batch size; confirm.
  consumed_train_samples: 14336000
  ignore_sanity_checks: true
  project: commav0p1-ablations
  # Run name; reused as the suffix of the checkpoint, dataset, lighteval and S3 paths in this file.
  run: commav0p1-ablations-1p82G-commonpile0p1filteredstackv2-seed-6-
  # NOTE(review): differs from data.seed (6) — confirm intentional.
  seed: 42
  # A multiple of checkpoints.checkpoint_interval (500) and below tokens.train_steps (14305);
  # presumably this file was saved alongside the step-14000 checkpoint — confirm.
  step: 14000
# No kill-switch file is configured.
kill_switch_path: null
# Evaluation (lighteval) settings; note these use their own parallelism block,
# separate from the top-level training `parallelism`.
lighteval:
  batch_size: 16
  checkpoints_path: null
  generation: null
  logging:
    hub_repo_details: null
    hub_repo_results: null
    # Same repo as experiment_logger.tensorboard_logger.repo_id.
    hub_repo_tensorboard: craffel/commav0p1-ablations
    local_output_path: /scratch/craffel/lighteval/commav0p1-ablations-1p82G-commonpile0p1filteredstackv2-seed-6-
    # Only the TensorBoard push is enabled; details/results hub pushes are off.
    push_details_to_hub: false
    push_results_to_hub: false
    push_results_to_tensorboard: true
    tensorboard_metric_prefix: e
  parallelism:
    # Evaluation uses dp 8, whereas training (top-level parallelism) uses dp 64.
    dp: 8
    expert_parallel_size: 1
    pp: 1
    pp_engine: 1f1b
    tp: 1
    # These two also differ from the training values (true / REDUCE_SCATTER).
    tp_linear_async_communication: false
    tp_mode: ALL_REDUCE
  # These paths live under /fsx, unlike the /scratch paths used elsewhere in this file.
  slurm_script_dir: /fsx/craffel/train/eval-scripts
  slurm_template: /fsx/craffel/run_eval.slurm.jinja
  tasks:
    custom_tasks: brrr.lighteval.evaluation_tasks
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: early-signal
  wandb: null
# Console logging verbosity and cadence.
logging:
  # Interval of 1 (presumably: log iteration info every step — confirm).
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
# Model definition: Llama-style config (is_llama_config: true) in bfloat16.
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    # Same value as model_config.initializer_range below.
    std: 0.02
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    # 2048 / 32 attention heads = 64 dimensions per head.
    hidden_size: 2048
    initializer_range: 0.02
    # 8192 = 4 * hidden_size.
    intermediate_size: 8192
    is_llama_config: true
    # Equal to tokens.sequence_length (2048).
    max_position_embeddings: 2048
    num_attention_heads: 32
    num_hidden_layers: 24
    # Equal to num_attention_heads, i.e. one KV head per query head.
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    # NOTE(review): tokenizer.tokenizer_name_or_path is gpt2 — confirm 50272 is the
    # intended embedding size for that tokenizer (it may be a padded vocabulary size).
    vocab_size: 50272
# Optimizer and learning-rate schedule (Adam-family hyperparameters per the key names).
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    # Peak 3e-4 with cosine decay down to min_decay_lr 3e-5 (10x lower).
    learning_rate: 0.0003
    # Both null: decay start/length are not set explicitly here
    # (presumably derived by the trainer from warmup and train_steps — confirm).
    lr_decay_starting_step: null
    lr_decay_steps: null
    lr_decay_style: cosine
    # Equal to checkpoints.checkpoint_interval (500); linear warmup.
    lr_warmup_steps: 500
    lr_warmup_style: linear
    min_decay_lr: 3.0e-05
  torch_adam_is_fused: true
  weight_decay: 0.1
  zero_stage: 0
# Training parallelism layout: data parallel only (tp = pp = expert_parallel_size = 1).
parallelism:
  # dp * tp * pp = 64 (presumably the number of training ranks — confirm).
  dp: 64
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  # NOTE(review): with tp: 1 these tensor-parallel options presumably have no effect — confirm.
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
# No profiler is configured.
profiler: null
# Checkpoint upload to S3 via the s5cmd binary.
s3_upload:
  # NOTE(review): presumably deletes the local copy under checkpoints.checkpoints_path
  # once uploaded — confirm; resume reads from the S3 path below.
  remove_after_upload: true
  s5cmd_concurrency: 5
  s5cmd_numworkers: 16
  s5cmd_path: /fsx/craffel/miniconda3/envs/exp/bin/s5cmd
  # Same URI as checkpoints.resume_checkpoint_path.
  upload_s3_path: s3://comma-v0.1-ablations/checkpoints/commav0p1-ablations-1p82G-commonpile0p1filteredstackv2-seed-6-
# Tokenizer selection; max length and revision are left unset (null).
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null
# Batch geometry and training length.
tokens:
  # micro_batch_size 4 * batch_accumulation_per_replica 4 * parallelism.dp 64 = 1024
  # sequences; at sequence_length 2048 that is 2,097,152 tokens
  # (presumably per optimizer step — confirm against the trainer).
  batch_accumulation_per_replica: 4
  # Both limits are 0 (presumably: no test/validation batches are run — confirm).
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 4
  # Equal to model.model_config.max_position_embeddings (2048).
  sequence_length: 2048
  # general.step is 14000, i.e. 305 steps short of this total.
  train_steps: 14305
  val_check_interval: 100