name: fineinstructions_ipt_fineinstructions_all_exp_chat
dump_dir: /fsx/craffel/fineinstructions/pretraining/ipt_fineinstructions_all_exp_chat/
seed: 777
grad_acc_steps: 8
gc_collect_freq: 1000
probe_freq: null
steps: 22000
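# Data: a single tokenized source under root_dir, sampled with weight 1.0.
# Assuming batch_size counts sequences per data-parallel rank, each optimizer step
# covers roughly 4 (batch) * 4096 (seq_len) * 8 (grad_acc_steps) * 8 (dp ranks)
# ~= 1.05M tokens, so the 22000-step run sees about 23B tokens.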
data:
  root_dir: /scratch/craffel/lingua/data/fineinstructions/
  sources:
    ipt_fineinstructions_all_exp_chat: 1.0
  batch_size: 4
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
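  # Tokenizer: tiktoken BPE loaded from the Llama 3 model file; n_words: null
  # presumably leaves the vocabulary size to be read from the tokenizer itself.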
  tokenizer:
    name: tiktoken
    path: /fsx/craffel/lingua/tokenizers/llama3.model
    n_words: null
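# Optimizer: AdamW-style settings (peak lr 1e-3, betas 0.9/0.95, weight decay 0.1,
# gradient clipping at 1.0) with a cosine schedule: 2000 warmup steps, then decay
# toward lr * lr_min_ratio over the run.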
optim:
  lr: 0.001
  weight_decay: 0.1
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
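# Model: Llama-style transformer with dim 2048, 25 layers, and 16 heads; head_dim:
# null is presumably derived as dim / n_heads = 2048 / 16 = 128, and n_kv_heads:
# null means KV heads default to n_heads (no GQA). vocab_size matches the Llama 3
# tokenizer (128256).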
model:
  dim: 2048
  n_layers: 25
  head_dim: null
  n_heads: 16
  n_kv_heads: null
  ffn_dim_multiplier: null
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  init_std_factor: disabled
  max_seqlen: 4096
  seed: 42
  vocab_size: 128256
  weight_tying: false
  sliding_window: null
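# Distributed: 8 data-parallel replicas, no tensor parallelism, bf16 parameters,
# torch.compile enabled. With dp_shard: 1, the full_shard FSDP setting effectively
# leaves each replica holding a full copy of the model.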
distributed:
  dp_shard: 1
  dp_replicate: 8
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  float8_filter: layers\.[0-9]+\.
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
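# Environment variables exported to the worker processes: thread pinning for
# MKL/OpenMP and NCCL debugging/robustness settings.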
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
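# Checkpointing: dump a training checkpoint every 2000 steps and trigger evals on
# the same cadence; keep: -1 presumably retains every checkpoint rather than
# rotating old ones out.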
checkpoint:
  dump:
    every: 2000
    keep: -1
  eval:
    every: 2000
    keep: -1
  path: /fsx/craffel/fineinstructions/pretraining/ipt_fineinstructions_all_exp_chat/checkpoints
  init_ckpt_path: null
  load_init_optimizer_state: false
  save_init_ckpt: false
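# Profiling: capture a memory snapshot over the first 4 steps (no warmup) and a
# profiler trace for 4 steps after a 100-step warmup; traces land in the
# 'profiling' folder (presumably inside dump_dir).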
profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
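# Logging: report metrics every step; wandb disabled. async_eval_gpus: 8 requests
# 8 GPUs so harness evaluations can run asynchronously alongside training.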
logging:
  freq: 1
  acc_freq: null
  wandb: null
async_eval_gpus: 8
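# Evaluation: run lm-evaluation-harness tasks (hellaswag, mmlu, commonsense_qa,
# sciq) with the chat template applied; generation uses bf16 and is capped at
# 8192 tokens.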
eval:
  harness:
    apply_chat_template: true
    tasks:
    - hellaswag
    - mmlu
    - commonsense_qa
    - sciq
    confirm_run_unsafe_code: true
  generator:
    max_tokens: 8192
    dtype: bf16