lr=2e-4
lora_rank=8
lora_alpha=32
lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
modules_to_save="embed_tokens,lm_head"
lora_dropout=0.05
pretrained_model=./llama-7b-hf
dna_eng_tokenizer_path=./merged_gene_eng_tokenizer_hf
dataset_dir=./train_data
data_cache=temp_data_cache_dir
per_device_train_batch_size=4
per_device_eval_batch_size=4
gradient_accumulation_steps=8
output_dir=dnahlm_llama_7b
deepspeed_config_file=ds_zero2_no_offload.json

torchrun --nnodes 1 --nproc_per_node 10 run_clm_pt_with_peft.py \
    --deepspeed ${deepspeed_config_file} \
    --model_name_or_path ${pretrained_model} \
    --tokenizer_name_or_path ${dna_eng_tokenizer_path} \
    --dataset_dir ${dataset_dir} \
    --data_cache_dir ${data_cache} \
    --validation_split_percentage 0.001 \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --do_train \
    --seed $RANDOM \
    --fp16 \
    --num_train_epochs 1 \
    --lr_scheduler_type cosine \
    --learning_rate ${lr} \
    --warmup_ratio 0.05 \
    --weight_decay 0.01 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 3 \
    --save_steps 200 \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --preprocessing_num_workers 64 \
    --block_size 256 \
    --output_dir ${output_dir} \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --lora_rank ${lora_rank} \
    --lora_alpha ${lora_alpha} \
    --trainable ${lora_trainable} \
    --modules_to_save ${modules_to_save} \
    --lora_dropout ${lora_dropout} \
    --torch_dtype float16 \
    --gradient_checkpointing \
    --ddp_find_unused_parameters False
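After training, the LoRA adapter (together with the retrained embed_tokens and lm_head, which are in modules_to_save because the merged DNA/English tokenizer enlarges the vocabulary) is written under ${output_dir}. Below is a minimal sketch, not part of the training script, of how such an adapter can be loaded for inference with the standard transformers + peft APIs; the paths mirror the variables above but may need to be adjusted to the actual checkpoint layout produced by run_clm_pt_with_peft.py, and the DNA prompt is only an illustrative placeholder.

# Minimal sketch: load the trained LoRA adapter for inference (assumed layout).
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_model_path = "./llama-7b-hf"                  # same base as pretrained_model
tokenizer_path = "./merged_gene_eng_tokenizer_hf"  # merged DNA + English tokenizer
adapter_path = "dnahlm_llama_7b"                   # output_dir from the script above

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16)

# The merged tokenizer adds DNA tokens, so the base model's embedding matrix
# must be resized before attaching the adapter that carries the retrained
# embed_tokens / lm_head weights.
base.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(base, adapter_path)
model = model.merge_and_unload()  # optional: fold LoRA weights into the base model
model.eval()

inputs = tokenizer("ATGGCCATTGTAATGGGCCGC", return_tensors="pt")  # example DNA prompt
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))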