diff --git "a/train_video.log" "b/train_video.log" new file mode 100644--- /dev/null +++ "b/train_video.log" @@ -0,0 +1,688 @@ +[2023-10-13 02:59:14,478] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-10-13 02:59:16,541] [WARNING] [runner.py:196:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2023-10-13 02:59:16,541] [INFO] [runner.py:555:main] cmd = /usr/local/miniconda3/envs/llava/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None llava/train/train_mem_video.py --deepspeed ./scripts/zero2.json --lora_enable True --model_name_or_path /hy-tmp/vicuna-7b-v1.3 --version v1 --data_path ./data/avsd_train_omni.json --video_folder /hy-tmp/Charades_v1_480 --vision_tower /hy-tmp/clip-vit-large-patch14 --pretrain_mm_mlp_adapter /hy-tmp/llava-pretrain-vicuna-7b-v1.3/mm_projector.bin --mm_vision_select_layer -2 --mm_use_im_start_end False --mm_use_im_patch_token False --bf16 True --output_dir /hy-tmp/checkpoints/omni-vicuna-7b-v1.3-finetune_lora --num_train_epochs 8 --per_device_train_batch_size 8 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --evaluation_strategy no --save_strategy steps --save_steps 100 --save_total_limit 3 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type cosine --logging_steps 1 --tf32 True --model_max_length 2048 --gradient_checkpointing True --lazy_preprocess True --dataloader_num_workers 8 --report_to wandb +[2023-10-13 02:59:17,802] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-10-13 02:59:19,574] [INFO] [launch.py:138:main] 0 NCCL_P2P_LEVEL=NVL +[2023-10-13 02:59:19,574] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1]} +[2023-10-13 02:59:19,574] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=2, node_rank=0 +[2023-10-13 02:59:19,574] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1]}) +[2023-10-13 02:59:19,574] [INFO] [launch.py:163:main] dist_world_size=2 +[2023-10-13 02:59:19,574] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1 +[2023-10-13 02:59:22,389] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-10-13 02:59:22,433] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-10-13 02:59:22,977] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-10-13 02:59:22,977] [INFO] [comm.py:594:init_distributed] cdb=None +[2023-10-13 02:59:22,977] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2023-10-13 02:59:23,051] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented +[2023-10-13 02:59:23,051] [INFO] [comm.py:594:init_distributed] cdb=None +You are using a model of type llama to instantiate a model of type omni. This is not supported for all configurations of models and can yield errors. +You are using a model of type llama to instantiate a model of type omni. This is not supported for all configurations of models and can yield errors. + Loading checkpoint shards: 0%| | 0/2 [00:00. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565 +You are using the legacy behaviour of the . This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565 +Formatting inputs...Skip in lazy mode +Rank: 0 partition count [2, 2] and sizes[(82444288, False), (2176, False)] +Rank: 1 partition count [2, 2] and sizes[(82444288, False), (2176, False)] +wandb: Currently logged in as: wanghao-cst. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.15.12 +wandb: Run data is saved locally in /root/Omni-LLM/wandb/run-20231013_030309-30lhy90r +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run fiery-dew-9 +wandb: ⭐️ View project at https://wandb.ai/wanghao-cst/huggingface +wandb: 🚀 View run at https://wandb.ai/wanghao-cst/huggingface/runs/30lhy90r + 0%| | 0/616 [00:00