SLURM batch script: multi-node distributed training of smi-ted (light), submitted with `sbatch`.
#!/bin/bash
# Example of running a Python training script in batch mode via sbatch.
# Allocation: 6 nodes, 1 task per node, 4 GPUs and 12 CPUs per task.
#SBATCH -J smi-ted-train
#SBATCH -t 6:00:00
#SBATCH -o output_smi_ted_light_epoch50_%j.out
#SBATCH --mem=64G
#SBATCH --nodes=6
#SBATCH --ntasks=6
#SBATCH --gpus-per-task=4
#SBATCH --cpus-per-task=12
# Expand the SLURM-compressed hostlist into an array of node names.
# (The original two-step `nodes=( ... ); nodes_array=($nodes)` was redundant:
# `$nodes` expands to only element 0 of the array, so the copy carried a
# single hostname. A single array assignment is both correct and clearer.)
nodes_array=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )

# First allocated node acts as the rendezvous head for torchrun.
head_node=${nodes_array[0]}

# Resolve the head node's IP address by running `hostname --ip-address`
# on that node; used below as the c10d rendezvous endpoint.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node IP: $head_node_ip"

export LOGLEVEL=INFO
# Load software environment.
# module load anaconda3
# NOTE(review): `/home/.bashrc` looks like a placeholder — usually this would
# be "$HOME/.bashrc"; confirm the intended path for the target cluster.
source /home/.bashrc
conda activate smi-ted-env
| # Run python script | |
| srun torchrun \ | |
| --nnodes 6 \ | |
| --nproc_per_node 4 \ | |
| --rdzv_id $RANDOM \ | |
| --rdzv_backend c10d \ | |
| --rdzv_endpoint $head_node_ip:29500 \ | |
| train_model_ED.py \ | |
| --device cuda \ | |
| --n_batch 288 \ | |
| --n_layer 12 \ | |
| --n_head 12 \ | |
| --n_embd 768 \ | |
| --max_len 202 \ | |
| --d_dropout 0.2 \ | |
| --lr_start 3e-5 \ | |
| --lr_multiplier 4 \ | |
| --lr_decoder 3e-5 \ | |
| --n_workers 12 \ | |
| --max_epochs 51 \ | |
| --gpu -1 \ | |
| --num_nodes 1 \ | |
| --num_feats 32 \ | |
| --root_dir . \ | |
| --checkpoint_every 10000 \ | |
| --grad_acc 1 \ | |
| --train_load 'pubchem' \ | |
| --smi_ted_version 'v1' \ | |
| --data_root './pubchem/pubchem_rd-canonical_smiles.smi' \ | |
| --save_checkpoint_path './light_checkpoints' \ | |
| --load_checkpoint_path '' \ | |
| --rotate \ | |
| --debug \ | |
| --model_arch 'BERT__both_rotate' \ |