diff --git "a/wandb/run-20241023_195332-5p9x2ymn/files/output.log" "b/wandb/run-20241023_195332-5p9x2ymn/files/output.log" new file mode 100644--- /dev/null +++ "b/wandb/run-20241023_195332-5p9x2ymn/files/output.log" @@ -0,0 +1,1725 @@ +***** Running training ***** + + +***** Evaluating at the beginning ***** + +***** Evaluating at the beginning ***** + +Evaluation: accuracy = 0.503532, reward_mean = 0.057497, reward_std = 1.434818 +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ +┃ prompt  ┃ higher-reward response  ┃ reward  ┃ lower-reward response  ┃ reward  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ +│ USER: │ There are two bags visible in the image. │ -1.500000 │ There are four bags visible in the │ -2.468750 │ +│ How many bags are visible in the image? │ │ │ image. │ │ +│ │ │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the signs and advertisements │ -0.730469 │ Based on the presence of multiple │ -0.429688 │ +│ What can you infer about the purpose of │ visible in the image, it can be inferred │ │ signs, including a large red sign that │ │ +│ the building with the pyramid-shaped │ that the building with the │ │ reads "Sale," it is likely that this │ │ +│ structure based on the signs and │ pyramid-shaped structure is likely a │ │ building serves as an outlet for retail │ │ +│ advertisements? │ shopping center or a commercial complex. │ │ or commercial purposes. The pyramid │ │ +│ │ The signs include names of various │ │ shape suggests that there might be some │ │ +│ │ brands and stores, such as "Ochsner," │ │ architectural significance to the │ │ +│ │ "Manor," and "Traita fina," which are │ │ design, but without more context, we │ │ +│ │ typically associated with retail and │ │ cannot definitively determine its │ │ +│ │ shopping. Additionally, the large text │ │ specific function. However, given the │ │ +│ │ "Shopping Center St. Jakob-Park" further │ │ promotional nature of the signs, one │ │ +│ │ supports the idea that this building is │ │ could reasonably assume that the │ │ +│ │ a shopping center. The prominent display │ │ building houses shops or businesses │ │ +│ │ of the word "BASEL" suggests that this │ │ offering discounts or special deals to │ │ +│ │ shopping center is located in or near │ │ attract customers. │ │ +│ │ Basel, Switzerland. │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the lighting and shadows in the │ 1.367188 │ The image shows a plane at an airport │ 1.843750 │ +│ Based on the lighting and shadows, can │ image, it appears that the sun is │ │ terminal with lights in the distance. │ │ +│ you infer if the sun is rising or │ setting. The sky has a gradient from a │ │ The sky has hues of yellow and orange │ │ +│ setting? │ darker blue at the top to a lighter │ │ near the horizon which suggests that │ │ +│ │ yellow-orange near the horizon, which is │ │ it's either sunrise or sunset. However, │ │ +│ │ typical of sunset. Additionally, the │ │ there are no direct sunlight rays │ │ +│ │ lights at the airport are on, which is │ │ visible, so we cannot determine whether │ │ +│ │ more common during the evening as it │ │ this photo was taken during dawn │ │ +│ │ gets darker. │ │ (sunrise) or dusk (sunset). It could be │ │ +│ │ │ │ either one depending on where the │ │ +│ │ │ │ photographer took the picture from. If │ │ +│ │ │ │ they were standing facing east when │ │ +│ │ │ │ taking the shot, then it would likely │ │ +│ │ │ │ be morning; but if they faced west │ │ +│ │ │ │ while capturing the scene, it might │ │ +│ │ │ │ have been evening instead. │ │ +│ │ │ │ │ │ +│ │ │ │ It appears to be early morning based │ │ +│ │ │ │ off how dark everything else looks │ │ +│ │ │ │ except for those areas around the │ │ +│ │ │ │ horizon line indicating light coming up │ │ +│ │ │ │ over mountains behind them - suggesting │ │ +│ │ │ │ daytime hours ahead rather than │ │ +│ │ │ │ nighttime ones since all other parts │ │ +│ │ │ │ remain shadowed by darkness still │ │ +│ │ │ │ lingering outside city limits below us │ │ +│ │ │ │ here today... │ │ +└──────────────────────────────────────────┴──────────────────────────────────────────┴───────────┴─────────────────────────────────────────┴───────────┘ +[2024-10-23 20:14:41,735] [WARNING] [stage3.py:2102:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time +[2024-10-23 20:15:28,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[1.6759776536312848e-06, 1.6759776536312848e-06], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:15:28,439] [INFO] [timer.py:259:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=6.578970516447695, CurrSamplesPerSec=6.842125484552865, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:16:21,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[3.3519553072625697e-06, 3.3519553072625697e-06], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:16:21,424] [INFO] [timer.py:259:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=6.431077814404028, CurrSamplesPerSec=6.26027517521924, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:17:14,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[5.027932960893854e-06, 5.027932960893854e-06], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:17:14,488] [INFO] [timer.py:259:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=6.405149665522051, CurrSamplesPerSec=5.956437161370143, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:18:07,968] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[6.703910614525139e-06, 6.703910614525139e-06], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:18:07,969] [INFO] [timer.py:259:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=6.3669091247814995, CurrSamplesPerSec=6.720567984947453, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:19:02,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[8.379888268156424e-06, 8.379888268156424e-06], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:19:02,491] [INFO] [timer.py:259:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=6.320446009462997, CurrSamplesPerSec=5.529900591762047, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:19:52,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[1.0055865921787709e-05, 1.0055865921787709e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:19:52,082] [INFO] [timer.py:259:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=6.401533323095599, CurrSamplesPerSec=7.232248648011896, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:20:43,891] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[1.1731843575418994e-05, 1.1731843575418994e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:20:43,892] [INFO] [timer.py:259:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=6.411978149103351, CurrSamplesPerSec=6.568764867938658, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 20:21:36,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[1.3407821229050279e-05, 1.3407821229050279e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:21:36,422] [INFO] [timer.py:259:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=6.411006641040977, CurrSamplesPerSec=6.744699197726902, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:22:29,025] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.5083798882681566e-05, 1.5083798882681566e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:22:29,025] [INFO] [timer.py:259:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=6.4074557796403235, CurrSamplesPerSec=7.3564674770944505, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:23:18,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[1.675977653631285e-05, 1.675977653631285e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:23:18,875] [INFO] [timer.py:259:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=6.440189480664135, CurrSamplesPerSec=6.309385441444046, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 20:24:08,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.8435754189944135e-05, 1.8435754189944135e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:24:08,890] [INFO] [timer.py:259:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=6.4708942861341034, CurrSamplesPerSec=5.618396693249352, MemAllocated=25.43GB, MaxMemAllocated=33.76GB +[2024-10-23 20:25:02,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[2.0111731843575417e-05, 2.0111731843575417e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:25:02,180] [INFO] [timer.py:259:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=6.456984362927101, CurrSamplesPerSec=6.144167387412185, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:25:57,025] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[2.1787709497206706e-05, 2.1787709497206706e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:25:57,026] [INFO] [timer.py:259:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=6.430215628566529, CurrSamplesPerSec=6.456618564418233, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:26:48,130] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[2.346368715083799e-05, 2.346368715083799e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:26:48,131] [INFO] [timer.py:259:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=6.4420688676963564, CurrSamplesPerSec=6.758661623690714, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 20:27:41,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[2.5139664804469275e-05, 2.5139664804469275e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:27:41,038] [INFO] [timer.py:259:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=6.438048422047617, CurrSamplesPerSec=6.273993947233979, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:28:31,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[2.6815642458100557e-05, 2.6815642458100557e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:28:31,845] [INFO] [timer.py:259:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=6.450053379265499, CurrSamplesPerSec=6.974268899818332, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:29:22,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[2.8491620111731843e-05, 2.8491620111731843e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:29:22,686] [INFO] [timer.py:259:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=6.461290046159942, CurrSamplesPerSec=5.910871985154041, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 20:30:12,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:30:12,850] [INFO] [timer.py:259:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=6.475578501863396, CurrSamplesPerSec=6.954984015423453, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:31:05,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:31:05,418] [INFO] [timer.py:259:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=6.4728490495047994, CurrSamplesPerSec=5.962889021184955, MemAllocated=25.42GB, MaxMemAllocated=33.76GB +[2024-10-23 20:31:57,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:31:57,014] [INFO] [timer.py:259:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=6.474075900758789, CurrSamplesPerSec=6.842650114795313, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 20:32:49,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:32:49,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=6.473643464007582, CurrSamplesPerSec=7.088643323744238, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:33:40,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:33:40,801] [INFO] [timer.py:259:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=6.475290597215718, CurrSamplesPerSec=6.540614837424684, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:34:35,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:34:35,428] [INFO] [timer.py:259:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=6.459470357858775, CurrSamplesPerSec=5.795399538680227, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 20:35:27,146] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:35:27,147] [INFO] [timer.py:259:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=6.46201539568019, CurrSamplesPerSec=6.859014992954133, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:36:16,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:36:16,192] [INFO] [timer.py:259:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=6.478781433586455, CurrSamplesPerSec=5.754448889080908, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:37:07,738] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:37:07,738] [INFO] [timer.py:259:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=6.479231455564791, CurrSamplesPerSec=6.281947724126389, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:37:58,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:37:58,941] [INFO] [timer.py:259:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=6.4817569823826595, CurrSamplesPerSec=6.547929648212891, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:38:52,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:38:52,338] [INFO] [timer.py:259:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=6.474394996687413, CurrSamplesPerSec=5.604792112057588, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:39:43,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:39:43,071] [INFO] [timer.py:259:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=6.480900126171035, CurrSamplesPerSec=6.330182216783231, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:40:36,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:40:36,687] [INFO] [timer.py:259:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=6.472587519307741, CurrSamplesPerSec=7.431390886046478, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:41:29,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:41:29,817] [INFO] [timer.py:259:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=6.468568419047268, CurrSamplesPerSec=6.404440580657717, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:42:20,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:42:20,464] [INFO] [timer.py:259:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=6.4746579989832425, CurrSamplesPerSec=7.387169101265565, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:43:10,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:43:10,990] [INFO] [timer.py:259:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=6.480090832933869, CurrSamplesPerSec=6.616624252461541, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:44:04,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:44:04,949] [INFO] [timer.py:259:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=6.472560659502512, CurrSamplesPerSec=6.101470269873934, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:44:55,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:44:55,945] [INFO] [timer.py:259:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=6.475129320313156, CurrSamplesPerSec=6.3408091347727, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 20:45:46,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:45:46,620] [INFO] [timer.py:259:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=6.478935963582719, CurrSamplesPerSec=7.480574201842291, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:46:37,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:46:37,023] [INFO] [timer.py:259:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=6.48381393574944, CurrSamplesPerSec=6.7097949866584194, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:47:27,962] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:47:27,963] [INFO] [timer.py:259:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=6.486339091606746, CurrSamplesPerSec=7.067725173771604, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:48:18,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:48:18,142] [INFO] [timer.py:259:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=6.4923677665748585, CurrSamplesPerSec=6.987089472512171, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:49:09,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:49:09,342] [INFO] [timer.py:259:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=6.49481640953235, CurrSamplesPerSec=6.193709434267256, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:50:04,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:50:04,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=6.483074865091321, CurrSamplesPerSec=6.4349656290045925, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:50:54,705] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:50:54,706] [INFO] [timer.py:259:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=6.490301888515983, CurrSamplesPerSec=6.72343225927632, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:51:45,442] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:51:45,443] [INFO] [timer.py:259:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=6.494303195603065, CurrSamplesPerSec=6.524756246359842, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:52:36,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:52:36,610] [INFO] [timer.py:259:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=6.495535412097898, CurrSamplesPerSec=6.875365002625769, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:53:28,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:53:28,390] [INFO] [timer.py:259:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=6.495122584336824, CurrSamplesPerSec=8.03989456057454, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:54:18,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:54:18,591] [INFO] [timer.py:259:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=6.50042595723352, CurrSamplesPerSec=7.045691238060207, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:55:10,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:55:10,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=6.500202819307158, CurrSamplesPerSec=6.6960535718767575, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:56:01,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:56:01,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=6.502090749044869, CurrSamplesPerSec=6.476949604333462, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:56:52,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:56:52,797] [INFO] [timer.py:259:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=6.502493021917951, CurrSamplesPerSec=7.413996791380327, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 20:57:45,247] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:57:45,248] [INFO] [timer.py:259:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=6.501042992240088, CurrSamplesPerSec=7.039925405115405, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 20:58:36,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:58:36,254] [INFO] [timer.py:259:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=6.5028544523942, CurrSamplesPerSec=6.201613268988083, MemAllocated=25.42GB, MaxMemAllocated=33.76GB +[2024-10-23 20:59:29,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 20:59:29,796] [INFO] [timer.py:259:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=6.499106787272397, CurrSamplesPerSec=6.971757289464488, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:00:22,571] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:00:22,572] [INFO] [timer.py:259:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=6.496824592444547, CurrSamplesPerSec=6.482133467490407, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:01:12,464] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:01:12,464] [INFO] [timer.py:259:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=6.50068649905756, CurrSamplesPerSec=6.58101268316692, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:02:04,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:02:04,124] [INFO] [timer.py:259:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=6.501328315632023, CurrSamplesPerSec=6.4245477766789305, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:02:56,953] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:02:56,954] [INFO] [timer.py:259:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=6.4998194564705845, CurrSamplesPerSec=6.593002139100127, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:03:47,574] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:03:47,575] [INFO] [timer.py:259:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=6.502907960726866, CurrSamplesPerSec=6.511506608975233, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:04:41,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:04:41,466] [INFO] [timer.py:259:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=6.498625854354999, CurrSamplesPerSec=5.894297188660469, MemAllocated=25.42GB, MaxMemAllocated=33.76GB +[2024-10-23 21:05:32,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:05:32,388] [INFO] [timer.py:259:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=6.500876392994717, CurrSamplesPerSec=6.476083933325555, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +Saving checkpoint at step 598 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-23 21:06:30,212] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step598 is about to be saved! +[2024-10-23 21:06:30,213] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_598.bin, tag: global_step598 +[2024-10-23 21:06:30,213] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_598.bin... +[2024-10-23 21:06:57,684] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_598.bin. +[2024-10-23 21:06:57,684] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step598 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-23 21:07:11,387] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step598 is about to be saved! +[2024-10-23 21:07:11,388] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_598.bin, tag: global_step598 +[2024-10-23 21:07:11,389] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_598.bin... +[2024-10-23 21:07:42,464] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_598.bin. +[2024-10-23 21:07:42,464] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step598 is ready now! +Model saved! +Checkpoint saved. +[2024-10-23 21:07:51,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:07:51,762] [INFO] [timer.py:259:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=6.496503530383252, CurrSamplesPerSec=6.736600950872196, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:08:41,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:08:41,639] [INFO] [timer.py:259:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=6.501023818789453, CurrSamplesPerSec=5.899889844598779, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:09:30,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:09:30,674] [INFO] [timer.py:259:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=6.50670654047025, CurrSamplesPerSec=7.222751535914935, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:10:22,155] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:10:22,155] [INFO] [timer.py:259:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=6.5074029238711395, CurrSamplesPerSec=7.218749154779346, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:11:16,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:11:16,658] [INFO] [timer.py:259:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=6.501239948351541, CurrSamplesPerSec=5.961729456808582, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:12:08,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:12:08,110] [INFO] [timer.py:259:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=6.501861090465665, CurrSamplesPerSec=6.156364716698934, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:12:59,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:12:59,782] [INFO] [timer.py:259:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=6.501574521142812, CurrSamplesPerSec=6.781142702142525, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:13:52,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:13:52,445] [INFO] [timer.py:259:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=6.500087631742788, CurrSamplesPerSec=6.624966482824723, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:14:45,467] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:14:45,468] [INFO] [timer.py:259:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=6.498007899925291, CurrSamplesPerSec=5.947993433089595, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:15:36,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:15:36,854] [INFO] [timer.py:259:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=6.49853050461093, CurrSamplesPerSec=6.537276537217811, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:16:27,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:16:27,638] [INFO] [timer.py:259:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=6.50067732949185, CurrSamplesPerSec=7.202987986575158, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:17:17,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:17:17,318] [INFO] [timer.py:259:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=6.505122430903443, CurrSamplesPerSec=6.841183514758627, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:18:08,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:18:08,526] [INFO] [timer.py:259:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=6.506134982966963, CurrSamplesPerSec=6.449693735118226, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:19:00,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:19:00,134] [INFO] [timer.py:259:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=6.506460326959788, CurrSamplesPerSec=6.592413739028611, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:19:52,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:19:52,137] [INFO] [timer.py:259:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=6.505210193956869, CurrSamplesPerSec=6.856634031852806, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:20:44,847] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:20:44,847] [INFO] [timer.py:259:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=6.502994085173191, CurrSamplesPerSec=6.62402516248929, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:21:37,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:21:37,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=6.502329878833982, CurrSamplesPerSec=5.510403595219742, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:22:27,787] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:22:27,787] [INFO] [timer.py:259:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=6.50425102002425, CurrSamplesPerSec=6.00948858136784, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:23:18,066] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:23:18,066] [INFO] [timer.py:259:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=6.506466854927218, CurrSamplesPerSec=6.439721884358075, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:24:11,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:24:11,220] [INFO] [timer.py:259:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=6.504125837467201, CurrSamplesPerSec=5.792755207598802, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:25:04,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:25:04,105] [INFO] [timer.py:259:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=6.502291688114022, CurrSamplesPerSec=6.629611934009392, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:25:55,539] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:25:55,540] [INFO] [timer.py:259:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=6.5031595847311054, CurrSamplesPerSec=7.775359022029377, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:26:46,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:26:46,547] [INFO] [timer.py:259:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=6.505079636789253, CurrSamplesPerSec=7.5837857545048255, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:27:38,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:27:38,018] [INFO] [timer.py:259:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=6.505500039645623, CurrSamplesPerSec=5.864550227891772, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:28:31,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:28:31,603] [INFO] [timer.py:259:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=6.502768738501206, CurrSamplesPerSec=6.336721320928488, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:29:23,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:29:23,627] [INFO] [timer.py:259:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=6.502242822967917, CurrSamplesPerSec=5.866018380686179, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:30:15,153] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:30:15,153] [INFO] [timer.py:259:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=6.502423342326814, CurrSamplesPerSec=6.330063394618523, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:31:06,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:31:06,300] [INFO] [timer.py:259:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=6.503141243172963, CurrSamplesPerSec=6.324054069609074, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:31:58,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:31:58,818] [INFO] [timer.py:259:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=6.501776107617705, CurrSamplesPerSec=6.287204220881772, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:32:50,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:32:50,531] [INFO] [timer.py:259:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=6.5021230224176705, CurrSamplesPerSec=6.558996057135132, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:33:43,484] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:33:43,485] [INFO] [timer.py:259:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=6.500260729566549, CurrSamplesPerSec=7.140977402189486, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:34:34,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:34:34,903] [INFO] [timer.py:259:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=6.500831579060029, CurrSamplesPerSec=6.36478553118225, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:35:26,793] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:35:26,794] [INFO] [timer.py:259:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=6.500541183452432, CurrSamplesPerSec=5.979056147470803, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:36:19,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:36:19,032] [INFO] [timer.py:259:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=6.499608457406355, CurrSamplesPerSec=6.327464156752752, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:37:10,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:37:10,594] [INFO] [timer.py:259:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=6.500099976778154, CurrSamplesPerSec=6.5465713241064725, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:38:01,454] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:38:01,454] [INFO] [timer.py:259:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=6.501439749859922, CurrSamplesPerSec=6.464056656957315, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:38:54,223] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:38:54,223] [INFO] [timer.py:259:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=6.499890601544383, CurrSamplesPerSec=6.460069612485648, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:39:44,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:39:44,514] [INFO] [timer.py:259:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=6.501810258356609, CurrSamplesPerSec=6.013510907325701, MemAllocated=25.42GB, MaxMemAllocated=33.76GB +[2024-10-23 21:40:34,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:40:34,679] [INFO] [timer.py:259:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=6.5041096010356, CurrSamplesPerSec=6.464675922353583, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:41:26,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:41:26,080] [INFO] [timer.py:259:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=6.50440652288022, CurrSamplesPerSec=6.538047175163655, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:42:18,063] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:42:18,063] [INFO] [timer.py:259:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=6.503971886655374, CurrSamplesPerSec=5.922946677005937, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:43:11,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:43:11,011] [INFO] [timer.py:259:stop] epoch=0/micro_step=1010/global_step=1010, RunningAvgSamplesPerSec=6.502315347497243, CurrSamplesPerSec=7.77231933190782, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:44:01,323] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:44:01,324] [INFO] [timer.py:259:stop] epoch=0/micro_step=1020/global_step=1020, RunningAvgSamplesPerSec=6.503879874131471, CurrSamplesPerSec=7.175129016631096, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:44:52,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:44:52,118] [INFO] [timer.py:259:stop] epoch=0/micro_step=1030/global_step=1030, RunningAvgSamplesPerSec=6.5052587862599, CurrSamplesPerSec=7.530750470160596, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:45:45,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:45:45,499] [INFO] [timer.py:259:stop] epoch=0/micro_step=1040/global_step=1040, RunningAvgSamplesPerSec=6.503247070551364, CurrSamplesPerSec=6.067864980483656, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:46:37,049] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:46:37,050] [INFO] [timer.py:259:stop] epoch=0/micro_step=1050/global_step=1050, RunningAvgSamplesPerSec=6.503719215278464, CurrSamplesPerSec=7.150740537233256, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:47:30,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:47:30,258] [INFO] [timer.py:259:stop] epoch=0/micro_step=1060/global_step=1060, RunningAvgSamplesPerSec=6.5018167389895885, CurrSamplesPerSec=6.439641551675323, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:48:21,965] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:48:21,965] [INFO] [timer.py:259:stop] epoch=0/micro_step=1070/global_step=1070, RunningAvgSamplesPerSec=6.5019965194732245, CurrSamplesPerSec=5.796384901407067, MemAllocated=25.42GB, MaxMemAllocated=33.76GB +[2024-10-23 21:49:13,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:49:13,334] [INFO] [timer.py:259:stop] epoch=0/micro_step=1080/global_step=1080, RunningAvgSamplesPerSec=6.502349747828737, CurrSamplesPerSec=5.926959686601864, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:50:04,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:50:04,129] [INFO] [timer.py:259:stop] epoch=0/micro_step=1090/global_step=1090, RunningAvgSamplesPerSec=6.503570247447981, CurrSamplesPerSec=6.573427745201, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:50:57,977] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:50:57,979] [INFO] [timer.py:259:stop] epoch=0/micro_step=1100/global_step=1100, RunningAvgSamplesPerSec=6.501044677793804, CurrSamplesPerSec=5.126753389430814, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:51:49,168] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:51:49,169] [INFO] [timer.py:259:stop] epoch=0/micro_step=1110/global_step=1110, RunningAvgSamplesPerSec=6.501958726112883, CurrSamplesPerSec=7.673407921491517, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:52:39,811] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:52:39,811] [INFO] [timer.py:259:stop] epoch=0/micro_step=1120/global_step=1120, RunningAvgSamplesPerSec=6.50306073182378, CurrSamplesPerSec=6.925757259102822, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:53:31,645] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:53:31,646] [INFO] [timer.py:259:stop] epoch=0/micro_step=1130/global_step=1130, RunningAvgSamplesPerSec=6.503074398948989, CurrSamplesPerSec=6.3187819097410785, MemAllocated=25.41GB, MaxMemAllocated=33.76GB +[2024-10-23 21:54:22,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:54:22,059] [INFO] [timer.py:259:stop] epoch=0/micro_step=1140/global_step=1140, RunningAvgSamplesPerSec=6.504784223584924, CurrSamplesPerSec=6.751075147209535, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:55:12,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:55:12,599] [INFO] [timer.py:259:stop] epoch=0/micro_step=1150/global_step=1150, RunningAvgSamplesPerSec=6.506339785751833, CurrSamplesPerSec=6.576252043278759, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +[2024-10-23 21:56:03,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:56:03,078] [INFO] [timer.py:259:stop] epoch=0/micro_step=1160/global_step=1160, RunningAvgSamplesPerSec=6.508004747514338, CurrSamplesPerSec=6.958410204345615, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:56:53,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:56:53,358] [INFO] [timer.py:259:stop] epoch=0/micro_step=1170/global_step=1170, RunningAvgSamplesPerSec=6.509839491170405, CurrSamplesPerSec=6.8265426431199066, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:57:45,225] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:57:45,226] [INFO] [timer.py:259:stop] epoch=0/micro_step=1180/global_step=1180, RunningAvgSamplesPerSec=6.509581246532201, CurrSamplesPerSec=6.317435507596419, MemAllocated=25.39GB, MaxMemAllocated=33.76GB +[2024-10-23 21:58:37,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 21:58:37,472] [INFO] [timer.py:259:stop] epoch=0/micro_step=1190/global_step=1190, RunningAvgSamplesPerSec=6.509039349109207, CurrSamplesPerSec=6.31738347131873, MemAllocated=25.4GB, MaxMemAllocated=33.76GB +Saving checkpoint at step 1196 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-23 21:59:20,258] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1196 is about to be saved! +[2024-10-23 21:59:20,259] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1196.bin, tag: global_step1196 +[2024-10-23 21:59:20,259] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1196.bin... +[2024-10-23 21:59:47,521] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1196.bin. +[2024-10-23 21:59:47,521] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1196 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-23 22:00:01,620] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1196 is about to be saved! +[2024-10-23 22:00:01,621] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1196.bin, tag: global_step1196 +[2024-10-23 22:00:01,621] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1196.bin... +[2024-10-23 22:00:39,609] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1196.bin. +[2024-10-23 22:00:39,612] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1196 is ready now! +Model saved! +Checkpoint saved. + +***** Evaluating at epoch 1/5.0 ***** + +***** Evaluating at the beginning ***** + +Evaluation: accuracy = 0.686579, reward_mean = 20.877401, reward_std = 4.553088 +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ +┃ prompt  ┃ higher-reward response  ┃ reward  ┃ lower-reward response  ┃ reward  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ +│ USER: │ There are two bags visible in the image. │ 18.375000 │ There are four bags visible in the │ 18.375000 │ +│ How many bags are visible in the image? │ │ │ image. │ │ +│ │ │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the signs and advertisements │ 26.250000 │ Based on the presence of multiple │ 21.500000 │ +│ What can you infer about the purpose of │ visible in the image, it can be inferred │ │ signs, including a large red sign that │ │ +│ the building with the pyramid-shaped │ that the building with the │ │ reads "Sale," it is likely that this │ │ +│ structure based on the signs and │ pyramid-shaped structure is likely a │ │ building serves as an outlet for retail │ │ +│ advertisements? │ shopping center or a commercial complex. │ │ or commercial purposes. The pyramid │ │ +│ │ The signs include names of various │ │ shape suggests that there might be some │ │ +│ │ brands and stores, such as "Ochsner," │ │ architectural significance to the │ │ +│ │ "Manor," and "Traita fina," which are │ │ design, but without more context, we │ │ +│ │ typically associated with retail and │ │ cannot definitively determine its │ │ +│ │ shopping. Additionally, the large text │ │ specific function. However, given the │ │ +│ │ "Shopping Center St. Jakob-Park" further │ │ promotional nature of the signs, one │ │ +│ │ supports the idea that this building is │ │ could reasonably assume that the │ │ +│ │ a shopping center. The prominent display │ │ building houses shops or businesses │ │ +│ │ of the word "BASEL" suggests that this │ │ offering discounts or special deals to │ │ +│ │ shopping center is located in or near │ │ attract customers. │ │ +│ │ Basel, Switzerland. │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the lighting and shadows in the │ 23.500000 │ The image shows a plane at an airport │ 18.750000 │ +│ Based on the lighting and shadows, can │ image, it appears that the sun is │ │ terminal with lights in the distance. │ │ +│ you infer if the sun is rising or │ setting. The sky has a gradient from a │ │ The sky has hues of yellow and orange │ │ +│ setting? │ darker blue at the top to a lighter │ │ near the horizon which suggests that │ │ +│ │ yellow-orange near the horizon, which is │ │ it's either sunrise or sunset. However, │ │ +│ │ typical of sunset. Additionally, the │ │ there are no direct sunlight rays │ │ +│ │ lights at the airport are on, which is │ │ visible, so we cannot determine whether │ │ +│ │ more common during the evening as it │ │ this photo was taken during dawn │ │ +│ │ gets darker. │ │ (sunrise) or dusk (sunset). It could be │ │ +│ │ │ │ either one depending on where the │ │ +│ │ │ │ photographer took the picture from. If │ │ +│ │ │ │ they were standing facing east when │ │ +│ │ │ │ taking the shot, then it would likely │ │ +│ │ │ │ be morning; but if they faced west │ │ +│ │ │ │ while capturing the scene, it might │ │ +│ │ │ │ have been evening instead. │ │ +│ │ │ │ │ │ +│ │ │ │ It appears to be early morning based │ │ +│ │ │ │ off how dark everything else looks │ │ +│ │ │ │ except for those areas around the │ │ +│ │ │ │ horizon line indicating light coming up │ │ +│ │ │ │ over mountains behind them - suggesting │ │ +│ │ │ │ daytime hours ahead rather than │ │ +│ │ │ │ nighttime ones since all other parts │ │ +│ │ │ │ remain shadowed by darkness still │ │ +│ │ │ │ lingering outside city limits below us │ │ +│ │ │ │ here today... │ │ +└──────────────────────────────────────────┴──────────────────────────────────────────┴───────────┴─────────────────────────────────────────┴───────────┘ +[2024-10-23 22:21:20,694] [WARNING] [stage3.py:2102:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time +[2024-10-23 22:21:36,366] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:21:36,367] [INFO] [timer.py:259:stop] epoch=1/micro_step=4/global_step=1200, RunningAvgSamplesPerSec=6.510372992350091, CurrSamplesPerSec=6.432772186434626, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:22:26,969] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:22:26,970] [INFO] [timer.py:259:stop] epoch=1/micro_step=14/global_step=1210, RunningAvgSamplesPerSec=6.511788119766364, CurrSamplesPerSec=7.220651315555799, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:23:19,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:23:19,635] [INFO] [timer.py:259:stop] epoch=1/micro_step=24/global_step=1220, RunningAvgSamplesPerSec=6.510494571222952, CurrSamplesPerSec=6.516289092272593, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:24:11,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:24:11,491] [INFO] [timer.py:259:stop] epoch=1/micro_step=34/global_step=1230, RunningAvgSamplesPerSec=6.510243489029112, CurrSamplesPerSec=6.6143280597100125, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:25:06,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:25:06,434] [INFO] [timer.py:259:stop] epoch=1/micro_step=44/global_step=1240, RunningAvgSamplesPerSec=6.506759964502863, CurrSamplesPerSec=5.952372006021049, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:25:59,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:25:59,207] [INFO] [timer.py:259:stop] epoch=1/micro_step=54/global_step=1250, RunningAvgSamplesPerSec=6.50597117531924, CurrSamplesPerSec=6.607129949226739, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:26:49,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:26:49,387] [INFO] [timer.py:259:stop] epoch=1/micro_step=64/global_step=1260, RunningAvgSamplesPerSec=6.507866639753691, CurrSamplesPerSec=6.245524501789516, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:27:41,028] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:27:41,029] [INFO] [timer.py:259:stop] epoch=1/micro_step=74/global_step=1270, RunningAvgSamplesPerSec=6.507888080720249, CurrSamplesPerSec=6.172499627406796, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:28:35,034] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:28:35,035] [INFO] [timer.py:259:stop] epoch=1/micro_step=84/global_step=1280, RunningAvgSamplesPerSec=6.5056326506506075, CurrSamplesPerSec=5.3511154669334715, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:29:24,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:29:24,394] [INFO] [timer.py:259:stop] epoch=1/micro_step=94/global_step=1290, RunningAvgSamplesPerSec=6.508363053170466, CurrSamplesPerSec=7.291966356591223, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:30:13,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:30:13,823] [INFO] [timer.py:259:stop] epoch=1/micro_step=104/global_step=1300, RunningAvgSamplesPerSec=6.51057089105971, CurrSamplesPerSec=6.439950225504847, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:31:05,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:31:05,173] [INFO] [timer.py:259:stop] epoch=1/micro_step=114/global_step=1310, RunningAvgSamplesPerSec=6.51097803301331, CurrSamplesPerSec=5.856354093072096, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:31:58,734] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:31:58,734] [INFO] [timer.py:259:stop] epoch=1/micro_step=124/global_step=1320, RunningAvgSamplesPerSec=6.5093740644334535, CurrSamplesPerSec=6.468539902301167, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:32:53,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:32:53,183] [INFO] [timer.py:259:stop] epoch=1/micro_step=134/global_step=1330, RunningAvgSamplesPerSec=6.506551362655965, CurrSamplesPerSec=7.02600033915186, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:33:44,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:33:44,375] [INFO] [timer.py:259:stop] epoch=1/micro_step=144/global_step=1340, RunningAvgSamplesPerSec=6.507257854139102, CurrSamplesPerSec=6.478431782734893, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:34:35,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:34:35,091] [INFO] [timer.py:259:stop] epoch=1/micro_step=154/global_step=1350, RunningAvgSamplesPerSec=6.508447975758948, CurrSamplesPerSec=7.55156336284774, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:35:26,627] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:35:26,628] [INFO] [timer.py:259:stop] epoch=1/micro_step=164/global_step=1360, RunningAvgSamplesPerSec=6.508624064280533, CurrSamplesPerSec=7.100319319833728, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:36:17,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:36:17,180] [INFO] [timer.py:259:stop] epoch=1/micro_step=174/global_step=1370, RunningAvgSamplesPerSec=6.50957582820173, CurrSamplesPerSec=6.876201212261025, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:37:09,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:37:09,501] [INFO] [timer.py:259:stop] epoch=1/micro_step=184/global_step=1380, RunningAvgSamplesPerSec=6.5092675855253725, CurrSamplesPerSec=6.677660632996622, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:37:58,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:37:58,649] [INFO] [timer.py:259:stop] epoch=1/micro_step=194/global_step=1390, RunningAvgSamplesPerSec=6.511581747232183, CurrSamplesPerSec=7.015227233389059, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:38:52,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:38:52,108] [INFO] [timer.py:259:stop] epoch=1/micro_step=204/global_step=1400, RunningAvgSamplesPerSec=6.510089838629287, CurrSamplesPerSec=6.394118205227861, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:39:44,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:39:44,200] [INFO] [timer.py:259:stop] epoch=1/micro_step=214/global_step=1410, RunningAvgSamplesPerSec=6.5098583574371105, CurrSamplesPerSec=6.373420418987023, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:40:37,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:40:37,274] [INFO] [timer.py:259:stop] epoch=1/micro_step=224/global_step=1420, RunningAvgSamplesPerSec=6.50855956202567, CurrSamplesPerSec=5.520250099589652, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:41:30,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:41:30,383] [INFO] [timer.py:259:stop] epoch=1/micro_step=234/global_step=1430, RunningAvgSamplesPerSec=6.50731478908686, CurrSamplesPerSec=6.4448747755148545, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:42:20,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:42:20,395] [INFO] [timer.py:259:stop] epoch=1/micro_step=244/global_step=1440, RunningAvgSamplesPerSec=6.509282073308986, CurrSamplesPerSec=7.122277255714569, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:43:10,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:43:10,395] [INFO] [timer.py:259:stop] epoch=1/micro_step=254/global_step=1450, RunningAvgSamplesPerSec=6.510528250857819, CurrSamplesPerSec=6.558520108818613, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:44:01,697] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:44:01,697] [INFO] [timer.py:259:stop] epoch=1/micro_step=264/global_step=1460, RunningAvgSamplesPerSec=6.510540886836154, CurrSamplesPerSec=5.838156906130823, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:44:53,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:44:53,975] [INFO] [timer.py:259:stop] epoch=1/micro_step=274/global_step=1470, RunningAvgSamplesPerSec=6.510180012809286, CurrSamplesPerSec=7.400193237641192, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:45:46,405] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:45:46,406] [INFO] [timer.py:259:stop] epoch=1/micro_step=284/global_step=1480, RunningAvgSamplesPerSec=6.50953291095795, CurrSamplesPerSec=6.601563454818184, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:46:37,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:46:37,948] [INFO] [timer.py:259:stop] epoch=1/micro_step=294/global_step=1490, RunningAvgSamplesPerSec=6.509709428548319, CurrSamplesPerSec=6.372442113953131, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 22:47:31,286] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:47:31,287] [INFO] [timer.py:259:stop] epoch=1/micro_step=304/global_step=1500, RunningAvgSamplesPerSec=6.50800588514185, CurrSamplesPerSec=6.035070677772201, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:48:24,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:48:24,144] [INFO] [timer.py:259:stop] epoch=1/micro_step=314/global_step=1510, RunningAvgSamplesPerSec=6.5069589879444205, CurrSamplesPerSec=5.680080405110502, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:49:14,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:49:14,302] [INFO] [timer.py:259:stop] epoch=1/micro_step=324/global_step=1520, RunningAvgSamplesPerSec=6.508214850285554, CurrSamplesPerSec=7.601090296562946, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:50:04,992] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:50:04,992] [INFO] [timer.py:259:stop] epoch=1/micro_step=334/global_step=1530, RunningAvgSamplesPerSec=6.509073033411982, CurrSamplesPerSec=5.923610646553887, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:50:58,697] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:50:58,698] [INFO] [timer.py:259:stop] epoch=1/micro_step=344/global_step=1540, RunningAvgSamplesPerSec=6.507481069267217, CurrSamplesPerSec=6.67873756397677, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:51:49,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:51:49,673] [INFO] [timer.py:259:stop] epoch=1/micro_step=354/global_step=1550, RunningAvgSamplesPerSec=6.507743549102243, CurrSamplesPerSec=7.651841718854222, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:52:39,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:52:39,554] [INFO] [timer.py:259:stop] epoch=1/micro_step=364/global_step=1560, RunningAvgSamplesPerSec=6.5091491490535605, CurrSamplesPerSec=6.918888532798715, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:53:31,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:53:31,748] [INFO] [timer.py:259:stop] epoch=1/micro_step=374/global_step=1570, RunningAvgSamplesPerSec=6.5088993306284175, CurrSamplesPerSec=6.332872134470621, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:54:22,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:54:22,179] [INFO] [timer.py:259:stop] epoch=1/micro_step=384/global_step=1580, RunningAvgSamplesPerSec=6.509858658940349, CurrSamplesPerSec=6.337727592401103, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:55:11,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:55:11,558] [INFO] [timer.py:259:stop] epoch=1/micro_step=394/global_step=1590, RunningAvgSamplesPerSec=6.511557430996192, CurrSamplesPerSec=5.997850748700303, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-23 22:56:05,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:56:05,938] [INFO] [timer.py:259:stop] epoch=1/micro_step=404/global_step=1600, RunningAvgSamplesPerSec=6.509307530020676, CurrSamplesPerSec=5.791692352564736, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:56:58,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:56:58,022] [INFO] [timer.py:259:stop] epoch=1/micro_step=414/global_step=1610, RunningAvgSamplesPerSec=6.509252262706876, CurrSamplesPerSec=7.1956400503102, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:57:47,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:57:47,578] [INFO] [timer.py:259:stop] epoch=1/micro_step=424/global_step=1620, RunningAvgSamplesPerSec=6.5110074381666845, CurrSamplesPerSec=6.876137802550521, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 22:58:38,326] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:58:38,326] [INFO] [timer.py:259:stop] epoch=1/micro_step=434/global_step=1630, RunningAvgSamplesPerSec=6.51183452538032, CurrSamplesPerSec=6.551694252164036, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 22:59:30,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 22:59:30,116] [INFO] [timer.py:259:stop] epoch=1/micro_step=444/global_step=1640, RunningAvgSamplesPerSec=6.51141058012788, CurrSamplesPerSec=6.220746852253601, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:00:23,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:00:23,016] [INFO] [timer.py:259:stop] epoch=1/micro_step=454/global_step=1650, RunningAvgSamplesPerSec=6.510500668403683, CurrSamplesPerSec=5.82872992670339, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:01:11,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:01:11,338] [INFO] [timer.py:259:stop] epoch=1/micro_step=464/global_step=1660, RunningAvgSamplesPerSec=6.513459000035857, CurrSamplesPerSec=6.575156047351178, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:02:03,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:02:03,138] [INFO] [timer.py:259:stop] epoch=1/micro_step=474/global_step=1670, RunningAvgSamplesPerSec=6.51346588904355, CurrSamplesPerSec=6.498601398917402, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:02:53,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:02:53,018] [INFO] [timer.py:259:stop] epoch=1/micro_step=484/global_step=1680, RunningAvgSamplesPerSec=6.514920136813763, CurrSamplesPerSec=7.307065411712798, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:03:46,565] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:03:46,566] [INFO] [timer.py:259:stop] epoch=1/micro_step=494/global_step=1690, RunningAvgSamplesPerSec=6.513554441856593, CurrSamplesPerSec=6.475659932729258, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:04:38,867] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:04:38,868] [INFO] [timer.py:259:stop] epoch=1/micro_step=504/global_step=1700, RunningAvgSamplesPerSec=6.513128029732634, CurrSamplesPerSec=6.554949015451529, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:05:29,757] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:05:29,758] [INFO] [timer.py:259:stop] epoch=1/micro_step=514/global_step=1710, RunningAvgSamplesPerSec=6.513751387398357, CurrSamplesPerSec=6.362731056507902, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:06:23,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:06:23,766] [INFO] [timer.py:259:stop] epoch=1/micro_step=524/global_step=1720, RunningAvgSamplesPerSec=6.511985853253418, CurrSamplesPerSec=5.915006538069898, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:07:15,186] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:07:15,186] [INFO] [timer.py:259:stop] epoch=1/micro_step=534/global_step=1730, RunningAvgSamplesPerSec=6.5121454619620325, CurrSamplesPerSec=6.499086942579648, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:08:04,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:08:04,674] [INFO] [timer.py:259:stop] epoch=1/micro_step=544/global_step=1740, RunningAvgSamplesPerSec=6.51367612573072, CurrSamplesPerSec=6.380836910718298, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:08:57,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:08:57,002] [INFO] [timer.py:259:stop] epoch=1/micro_step=554/global_step=1750, RunningAvgSamplesPerSec=6.513220746439107, CurrSamplesPerSec=7.774768998462878, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:09:50,099] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:09:50,100] [INFO] [timer.py:259:stop] epoch=1/micro_step=564/global_step=1760, RunningAvgSamplesPerSec=6.5122387262911365, CurrSamplesPerSec=6.043781858813163, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:10:40,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:10:40,758] [INFO] [timer.py:259:stop] epoch=1/micro_step=574/global_step=1770, RunningAvgSamplesPerSec=6.5132132992102765, CurrSamplesPerSec=6.5381131018035585, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:11:33,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:11:33,889] [INFO] [timer.py:259:stop] epoch=1/micro_step=584/global_step=1780, RunningAvgSamplesPerSec=6.512297189434477, CurrSamplesPerSec=6.606684713510541, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:12:27,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:12:27,526] [INFO] [timer.py:259:stop] epoch=1/micro_step=594/global_step=1790, RunningAvgSamplesPerSec=6.5108729169153206, CurrSamplesPerSec=5.464964591875034, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 1794 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-23 23:13:02,734] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1794 is about to be saved! +[2024-10-23 23:13:02,735] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1794.bin, tag: global_step1794 +[2024-10-23 23:13:02,735] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1794.bin... +[2024-10-23 23:13:38,460] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1794.bin. +[2024-10-23 23:13:38,461] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1794 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-23 23:13:51,279] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1794 is about to be saved! +[2024-10-23 23:13:51,280] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1794.bin, tag: global_step1794 +[2024-10-23 23:13:51,280] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1794.bin... +[2024-10-23 23:14:19,956] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_1794.bin. +[2024-10-23 23:14:19,958] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1794 is ready now! +Model saved! +Checkpoint saved. +[2024-10-23 23:14:49,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:14:49,381] [INFO] [timer.py:259:stop] epoch=1/micro_step=604/global_step=1800, RunningAvgSamplesPerSec=6.511878879095428, CurrSamplesPerSec=6.552635919666536, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:15:39,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=1810, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:15:39,281] [INFO] [timer.py:259:stop] epoch=1/micro_step=614/global_step=1810, RunningAvgSamplesPerSec=6.513065815855339, CurrSamplesPerSec=7.398000806206072, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:16:28,873] [INFO] [logging.py:96:log_dist] [Rank 0] step=1820, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:16:28,873] [INFO] [timer.py:259:stop] epoch=1/micro_step=624/global_step=1820, RunningAvgSamplesPerSec=6.514397244196447, CurrSamplesPerSec=6.403117914008447, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:17:22,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=1830, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:17:22,063] [INFO] [timer.py:259:stop] epoch=1/micro_step=634/global_step=1830, RunningAvgSamplesPerSec=6.513392216412993, CurrSamplesPerSec=5.862188833942031, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:18:13,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=1840, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:18:13,930] [INFO] [timer.py:259:stop] epoch=1/micro_step=644/global_step=1840, RunningAvgSamplesPerSec=6.513044069194428, CurrSamplesPerSec=5.707335568101138, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:19:05,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=1850, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:19:05,960] [INFO] [timer.py:259:stop] epoch=1/micro_step=654/global_step=1850, RunningAvgSamplesPerSec=6.512600150881733, CurrSamplesPerSec=6.811721611177343, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:19:57,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=1860, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:19:57,686] [INFO] [timer.py:259:stop] epoch=1/micro_step=664/global_step=1860, RunningAvgSamplesPerSec=6.512804408620647, CurrSamplesPerSec=5.14584145172935, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:20:50,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=1870, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:20:50,446] [INFO] [timer.py:259:stop] epoch=1/micro_step=674/global_step=1870, RunningAvgSamplesPerSec=6.512128427071808, CurrSamplesPerSec=5.826343413063019, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:21:43,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=1880, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:21:43,950] [INFO] [timer.py:259:stop] epoch=1/micro_step=684/global_step=1880, RunningAvgSamplesPerSec=6.510806819468989, CurrSamplesPerSec=6.512994849905926, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:22:34,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=1890, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:22:34,610] [INFO] [timer.py:259:stop] epoch=1/micro_step=694/global_step=1890, RunningAvgSamplesPerSec=6.511457866459239, CurrSamplesPerSec=6.681465836570054, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:23:24,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=1900, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:23:24,334] [INFO] [timer.py:259:stop] epoch=1/micro_step=704/global_step=1900, RunningAvgSamplesPerSec=6.512859958070175, CurrSamplesPerSec=7.170838619615155, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:24:15,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=1910, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:24:15,226] [INFO] [timer.py:259:stop] epoch=1/micro_step=714/global_step=1910, RunningAvgSamplesPerSec=6.513837897565748, CurrSamplesPerSec=6.970084611243706, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:25:05,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=1920, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:25:05,554] [INFO] [timer.py:259:stop] epoch=1/micro_step=724/global_step=1920, RunningAvgSamplesPerSec=6.514832430369723, CurrSamplesPerSec=6.55761167199552, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:25:57,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=1930, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:25:57,285] [INFO] [timer.py:259:stop] epoch=1/micro_step=734/global_step=1930, RunningAvgSamplesPerSec=6.514725493305741, CurrSamplesPerSec=7.197905621825658, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:26:50,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=1940, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:26:50,673] [INFO] [timer.py:259:stop] epoch=1/micro_step=744/global_step=1940, RunningAvgSamplesPerSec=6.513474948778389, CurrSamplesPerSec=6.1988198117093685, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:27:41,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=1950, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:27:41,674] [INFO] [timer.py:259:stop] epoch=1/micro_step=754/global_step=1950, RunningAvgSamplesPerSec=6.513957106505292, CurrSamplesPerSec=6.541470429170818, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:28:33,797] [INFO] [logging.py:96:log_dist] [Rank 0] step=1960, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:28:33,798] [INFO] [timer.py:259:stop] epoch=1/micro_step=764/global_step=1960, RunningAvgSamplesPerSec=6.513554089621681, CurrSamplesPerSec=6.354727982793316, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:29:24,437] [INFO] [logging.py:96:log_dist] [Rank 0] step=1970, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:29:24,437] [INFO] [timer.py:259:stop] epoch=1/micro_step=774/global_step=1970, RunningAvgSamplesPerSec=6.513997999804782, CurrSamplesPerSec=7.243201522998605, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:30:16,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=1980, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:30:16,017] [INFO] [timer.py:259:stop] epoch=1/micro_step=784/global_step=1980, RunningAvgSamplesPerSec=6.514051298093842, CurrSamplesPerSec=6.652361519665567, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:31:09,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=1990, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:31:09,740] [INFO] [timer.py:259:stop] epoch=1/micro_step=794/global_step=1990, RunningAvgSamplesPerSec=6.512474719247795, CurrSamplesPerSec=5.290797013687399, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:32:00,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=2000, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:32:00,930] [INFO] [timer.py:259:stop] epoch=1/micro_step=804/global_step=2000, RunningAvgSamplesPerSec=6.512612831077869, CurrSamplesPerSec=6.981626113047358, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:32:52,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=2010, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:32:52,648] [INFO] [timer.py:259:stop] epoch=1/micro_step=814/global_step=2010, RunningAvgSamplesPerSec=6.5126407817131415, CurrSamplesPerSec=6.434670080924604, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:33:43,162] [INFO] [logging.py:96:log_dist] [Rank 0] step=2020, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:33:43,163] [INFO] [timer.py:259:stop] epoch=1/micro_step=824/global_step=2020, RunningAvgSamplesPerSec=6.5138169032647895, CurrSamplesPerSec=6.503061182401223, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:34:35,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=2030, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:34:35,373] [INFO] [timer.py:259:stop] epoch=1/micro_step=834/global_step=2030, RunningAvgSamplesPerSec=6.513629760645536, CurrSamplesPerSec=6.779030162185176, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:35:30,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=2040, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:35:30,013] [INFO] [timer.py:259:stop] epoch=1/micro_step=844/global_step=2040, RunningAvgSamplesPerSec=6.511664755041628, CurrSamplesPerSec=6.313988396646313, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:36:20,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=2050, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:36:20,932] [INFO] [timer.py:259:stop] epoch=1/micro_step=854/global_step=2050, RunningAvgSamplesPerSec=6.511857436378962, CurrSamplesPerSec=5.884874474560218, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:37:12,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=2060, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:37:12,059] [INFO] [timer.py:259:stop] epoch=1/micro_step=864/global_step=2060, RunningAvgSamplesPerSec=6.512096075167476, CurrSamplesPerSec=6.762779849193039, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:38:04,127] [INFO] [logging.py:96:log_dist] [Rank 0] step=2070, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:38:04,128] [INFO] [timer.py:259:stop] epoch=1/micro_step=874/global_step=2070, RunningAvgSamplesPerSec=6.51177586954201, CurrSamplesPerSec=6.360953433309131, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:38:56,221] [INFO] [logging.py:96:log_dist] [Rank 0] step=2080, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:38:56,221] [INFO] [timer.py:259:stop] epoch=1/micro_step=884/global_step=2080, RunningAvgSamplesPerSec=6.511423265405062, CurrSamplesPerSec=6.33342408010272, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:39:49,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=2090, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:39:49,435] [INFO] [timer.py:259:stop] epoch=1/micro_step=894/global_step=2090, RunningAvgSamplesPerSec=6.510342089940063, CurrSamplesPerSec=6.166743387076943, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:40:39,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=2100, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:40:39,854] [INFO] [timer.py:259:stop] epoch=1/micro_step=904/global_step=2100, RunningAvgSamplesPerSec=6.51114912040494, CurrSamplesPerSec=6.778111987749384, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:41:31,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=2110, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:41:31,817] [INFO] [timer.py:259:stop] epoch=1/micro_step=914/global_step=2110, RunningAvgSamplesPerSec=6.511108947296892, CurrSamplesPerSec=6.351319710648676, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:42:23,630] [INFO] [logging.py:96:log_dist] [Rank 0] step=2120, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:42:23,630] [INFO] [timer.py:259:stop] epoch=1/micro_step=924/global_step=2120, RunningAvgSamplesPerSec=6.510816808459944, CurrSamplesPerSec=5.753543829871893, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:43:15,301] [INFO] [logging.py:96:log_dist] [Rank 0] step=2130, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:43:15,302] [INFO] [timer.py:259:stop] epoch=1/micro_step=934/global_step=2130, RunningAvgSamplesPerSec=6.510790269046262, CurrSamplesPerSec=6.689270625911826, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:44:07,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=2140, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:44:07,614] [INFO] [timer.py:259:stop] epoch=1/micro_step=944/global_step=2140, RunningAvgSamplesPerSec=6.510535719273972, CurrSamplesPerSec=7.175981036401915, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:44:58,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=2150, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:44:58,364] [INFO] [timer.py:259:stop] epoch=1/micro_step=954/global_step=2150, RunningAvgSamplesPerSec=6.511255506212154, CurrSamplesPerSec=6.943621599779497, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:45:51,588] [INFO] [logging.py:96:log_dist] [Rank 0] step=2160, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:45:51,588] [INFO] [timer.py:259:stop] epoch=1/micro_step=964/global_step=2160, RunningAvgSamplesPerSec=6.510301650610165, CurrSamplesPerSec=6.742755631848933, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:46:41,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=2170, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:46:41,110] [INFO] [timer.py:259:stop] epoch=1/micro_step=974/global_step=2170, RunningAvgSamplesPerSec=6.511658793652696, CurrSamplesPerSec=6.910471815654388, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:47:31,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=2180, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:47:31,654] [INFO] [timer.py:259:stop] epoch=1/micro_step=984/global_step=2180, RunningAvgSamplesPerSec=6.512170853969847, CurrSamplesPerSec=6.224544328718742, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:48:22,741] [INFO] [logging.py:96:log_dist] [Rank 0] step=2190, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:48:22,742] [INFO] [timer.py:259:stop] epoch=1/micro_step=994/global_step=2190, RunningAvgSamplesPerSec=6.5128103750289865, CurrSamplesPerSec=6.37189968203646, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-23 23:49:16,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=2200, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:49:16,467] [INFO] [timer.py:259:stop] epoch=1/micro_step=1004/global_step=2200, RunningAvgSamplesPerSec=6.511606712962667, CurrSamplesPerSec=6.665791919614708, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:50:08,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=2210, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:50:08,822] [INFO] [timer.py:259:stop] epoch=1/micro_step=1014/global_step=2210, RunningAvgSamplesPerSec=6.511055753877861, CurrSamplesPerSec=7.353656581442639, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:50:59,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=2220, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:50:59,165] [INFO] [timer.py:259:stop] epoch=1/micro_step=1024/global_step=2220, RunningAvgSamplesPerSec=6.512003121817875, CurrSamplesPerSec=5.828293062739078, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-23 23:51:52,089] [INFO] [logging.py:96:log_dist] [Rank 0] step=2230, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:51:52,090] [INFO] [timer.py:259:stop] epoch=1/micro_step=1034/global_step=2230, RunningAvgSamplesPerSec=6.511377607591924, CurrSamplesPerSec=5.238849320541142, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:52:43,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=2240, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:52:43,791] [INFO] [timer.py:259:stop] epoch=1/micro_step=1044/global_step=2240, RunningAvgSamplesPerSec=6.5115315707129025, CurrSamplesPerSec=6.283406408270614, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:53:35,629] [INFO] [logging.py:96:log_dist] [Rank 0] step=2250, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:53:35,630] [INFO] [timer.py:259:stop] epoch=1/micro_step=1054/global_step=2250, RunningAvgSamplesPerSec=6.511507330126153, CurrSamplesPerSec=6.972151318364258, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:54:27,975] [INFO] [logging.py:96:log_dist] [Rank 0] step=2260, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:54:27,975] [INFO] [timer.py:259:stop] epoch=1/micro_step=1064/global_step=2260, RunningAvgSamplesPerSec=6.511174248467307, CurrSamplesPerSec=6.871670011758988, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:55:19,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=2270, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:55:19,494] [INFO] [timer.py:259:stop] epoch=1/micro_step=1074/global_step=2270, RunningAvgSamplesPerSec=6.511294556660326, CurrSamplesPerSec=7.497689348808838, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:56:11,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=2280, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:56:11,133] [INFO] [timer.py:259:stop] epoch=1/micro_step=1084/global_step=2280, RunningAvgSamplesPerSec=6.511325094288139, CurrSamplesPerSec=6.290221486741889, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:57:02,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=2290, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:57:02,754] [INFO] [timer.py:259:stop] epoch=1/micro_step=1094/global_step=2290, RunningAvgSamplesPerSec=6.511382355145371, CurrSamplesPerSec=6.153841246009542, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-23 23:57:57,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=2300, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:57:57,022] [INFO] [timer.py:259:stop] epoch=1/micro_step=1104/global_step=2300, RunningAvgSamplesPerSec=6.509978642426532, CurrSamplesPerSec=6.233860140534824, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-23 23:58:46,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=2310, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:58:46,890] [INFO] [timer.py:259:stop] epoch=1/micro_step=1114/global_step=2310, RunningAvgSamplesPerSec=6.511081095074621, CurrSamplesPerSec=7.3398469922681135, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-23 23:59:38,217] [INFO] [logging.py:96:log_dist] [Rank 0] step=2320, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-23 23:59:38,218] [INFO] [timer.py:259:stop] epoch=1/micro_step=1124/global_step=2320, RunningAvgSamplesPerSec=6.511391008885339, CurrSamplesPerSec=6.48744196823797, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:00:29,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=2330, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:00:29,447] [INFO] [timer.py:259:stop] epoch=1/micro_step=1134/global_step=2330, RunningAvgSamplesPerSec=6.511877895575802, CurrSamplesPerSec=7.03673648819557, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:01:19,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=2340, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:01:19,727] [INFO] [timer.py:259:stop] epoch=1/micro_step=1144/global_step=2340, RunningAvgSamplesPerSec=6.512531230060301, CurrSamplesPerSec=6.804347448270118, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:02:10,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=2350, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:02:10,714] [INFO] [timer.py:259:stop] epoch=1/micro_step=1154/global_step=2350, RunningAvgSamplesPerSec=6.513020072948631, CurrSamplesPerSec=6.794553101976678, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:03:00,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=2360, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:03:00,598] [INFO] [timer.py:259:stop] epoch=1/micro_step=1164/global_step=2360, RunningAvgSamplesPerSec=6.514017739380342, CurrSamplesPerSec=7.270325518459403, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:03:52,005] [INFO] [logging.py:96:log_dist] [Rank 0] step=2370, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:03:52,006] [INFO] [timer.py:259:stop] epoch=1/micro_step=1174/global_step=2370, RunningAvgSamplesPerSec=6.514071133671689, CurrSamplesPerSec=6.499374275104141, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:04:44,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=2380, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:04:44,180] [INFO] [timer.py:259:stop] epoch=1/micro_step=1184/global_step=2380, RunningAvgSamplesPerSec=6.513877397523225, CurrSamplesPerSec=7.420537565084771, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:05:38,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=2390, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:05:38,586] [INFO] [timer.py:259:stop] epoch=1/micro_step=1194/global_step=2390, RunningAvgSamplesPerSec=6.512496295440422, CurrSamplesPerSec=6.129569325338186, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 2392 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 00:05:58,560] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2392 is about to be saved! +[2024-10-24 00:05:58,561] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2392.bin, tag: global_step2392 +[2024-10-24 00:05:58,561] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2392.bin... +[2024-10-24 00:06:27,202] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2392.bin. +[2024-10-24 00:06:27,203] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2392 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 00:06:40,244] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2392 is about to be saved! +[2024-10-24 00:06:40,245] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2392.bin, tag: global_step2392 +[2024-10-24 00:06:40,245] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2392.bin... +[2024-10-24 00:07:18,015] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2392.bin. +[2024-10-24 00:07:18,018] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2392 is ready now! +Model saved! +Checkpoint saved. + +***** Evaluating at epoch 2/5.0 ***** + +***** Evaluating at the beginning ***** +Evaluation: accuracy = 0.679745, reward_mean = 9.938471, reward_std = 3.178873 +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ +┃ prompt  ┃ higher-reward response  ┃ reward  ┃ lower-reward response  ┃ reward  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ +│ USER: │ There are two bags visible in the image. │ 12.625000 │ There are four bags visible in the │ 12.625000 │ +│ How many bags are visible in the image? │ │ │ image. │ │ +│ │ │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the signs and advertisements │ 11.750000 │ Based on the presence of multiple │ 8.437500 │ +│ What can you infer about the purpose of │ visible in the image, it can be inferred │ │ signs, including a large red sign that │ │ +│ the building with the pyramid-shaped │ that the building with the │ │ reads "Sale," it is likely that this │ │ +│ structure based on the signs and │ pyramid-shaped structure is likely a │ │ building serves as an outlet for retail │ │ +│ advertisements? │ shopping center or a commercial complex. │ │ or commercial purposes. The pyramid │ │ +│ │ The signs include names of various │ │ shape suggests that there might be some │ │ +│ │ brands and stores, such as "Ochsner," │ │ architectural significance to the │ │ +│ │ "Manor," and "Traita fina," which are │ │ design, but without more context, we │ │ +│ │ typically associated with retail and │ │ cannot definitively determine its │ │ +│ │ shopping. Additionally, the large text │ │ specific function. However, given the │ │ +│ │ "Shopping Center St. Jakob-Park" further │ │ promotional nature of the signs, one │ │ +│ │ supports the idea that this building is │ │ could reasonably assume that the │ │ +│ │ a shopping center. The prominent display │ │ building houses shops or businesses │ │ +│ │ of the word "BASEL" suggests that this │ │ offering discounts or special deals to │ │ +│ │ shopping center is located in or near │ │ attract customers. │ │ +│ │ Basel, Switzerland. │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the lighting and shadows in the │ 11.937500 │ The image shows a plane at an airport │ 6.062500 │ +│ Based on the lighting and shadows, can │ image, it appears that the sun is │ │ terminal with lights in the distance. │ │ +│ you infer if the sun is rising or │ setting. The sky has a gradient from a │ │ The sky has hues of yellow and orange │ │ +│ setting? │ darker blue at the top to a lighter │ │ near the horizon which suggests that │ │ +│ │ yellow-orange near the horizon, which is │ │ it's either sunrise or sunset. However, │ │ +│ │ typical of sunset. Additionally, the │ │ there are no direct sunlight rays │ │ +│ │ lights at the airport are on, which is │ │ visible, so we cannot determine whether │ │ +│ │ more common during the evening as it │ │ this photo was taken during dawn │ │ +│ │ gets darker. │ │ (sunrise) or dusk (sunset). It could be │ │ +│ │ │ │ either one depending on where the │ │ +│ │ │ │ photographer took the picture from. If │ │ +│ │ │ │ they were standing facing east when │ │ +│ │ │ │ taking the shot, then it would likely │ │ +│ │ │ │ be morning; but if they faced west │ │ +│ │ │ │ while capturing the scene, it might │ │ +│ │ │ │ have been evening instead. │ │ +│ │ │ │ │ │ +│ │ │ │ It appears to be early morning based │ │ +│ │ │ │ off how dark everything else looks │ │ +│ │ │ │ except for those areas around the │ │ +│ │ │ │ horizon line indicating light coming up │ │ +│ │ │ │ over mountains behind them - suggesting │ │ +│ │ │ │ daytime hours ahead rather than │ │ +│ │ │ │ nighttime ones since all other parts │ │ +│ │ │ │ remain shadowed by darkness still │ │ +│ │ │ │ lingering outside city limits below us │ │ +│ │ │ │ here today... │ │ +└──────────────────────────────────────────┴──────────────────────────────────────────┴───────────┴─────────────────────────────────────────┴───────────┘ +[2024-10-24 00:28:01,097] [WARNING] [stage3.py:2102:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time +[2024-10-24 00:28:37,014] [INFO] [logging.py:96:log_dist] [Rank 0] step=2400, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:28:37,014] [INFO] [timer.py:259:stop] epoch=2/micro_step=8/global_step=2400, RunningAvgSamplesPerSec=6.514480696915276, CurrSamplesPerSec=7.032471403500234, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:29:29,463] [INFO] [logging.py:96:log_dist] [Rank 0] step=2410, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:29:29,464] [INFO] [timer.py:259:stop] epoch=2/micro_step=18/global_step=2410, RunningAvgSamplesPerSec=6.513923517165513, CurrSamplesPerSec=5.023736152587171, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:30:20,760] [INFO] [logging.py:96:log_dist] [Rank 0] step=2420, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:30:20,761] [INFO] [timer.py:259:stop] epoch=2/micro_step=28/global_step=2420, RunningAvgSamplesPerSec=6.514108439629836, CurrSamplesPerSec=6.85390225676849, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:31:15,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=2430, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:31:15,011] [INFO] [timer.py:259:stop] epoch=2/micro_step=38/global_step=2430, RunningAvgSamplesPerSec=6.512608769622537, CurrSamplesPerSec=6.113955771419362, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:32:08,582] [INFO] [logging.py:96:log_dist] [Rank 0] step=2440, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:32:08,583] [INFO] [timer.py:259:stop] epoch=2/micro_step=48/global_step=2440, RunningAvgSamplesPerSec=6.511620062419396, CurrSamplesPerSec=6.481663600143004, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 00:33:00,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=2450, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:33:00,317] [INFO] [timer.py:259:stop] epoch=2/micro_step=58/global_step=2450, RunningAvgSamplesPerSec=6.511649372125508, CurrSamplesPerSec=7.019091113035979, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:33:51,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=2460, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:33:51,568] [INFO] [timer.py:259:stop] epoch=2/micro_step=68/global_step=2460, RunningAvgSamplesPerSec=6.511897822817803, CurrSamplesPerSec=6.330185799430338, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:34:42,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=2470, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:34:42,951] [INFO] [timer.py:259:stop] epoch=2/micro_step=78/global_step=2470, RunningAvgSamplesPerSec=6.512037800219525, CurrSamplesPerSec=6.111615267006509, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:35:35,541] [INFO] [logging.py:96:log_dist] [Rank 0] step=2480, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:35:35,542] [INFO] [timer.py:259:stop] epoch=2/micro_step=88/global_step=2480, RunningAvgSamplesPerSec=6.511632744421063, CurrSamplesPerSec=6.493986537169507, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:36:25,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=2490, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:36:25,374] [INFO] [timer.py:259:stop] epoch=2/micro_step=98/global_step=2490, RunningAvgSamplesPerSec=6.512581160507826, CurrSamplesPerSec=6.400592036480433, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:37:14,283] [INFO] [logging.py:96:log_dist] [Rank 0] step=2500, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:37:14,284] [INFO] [timer.py:259:stop] epoch=2/micro_step=108/global_step=2500, RunningAvgSamplesPerSec=6.51410192303995, CurrSamplesPerSec=7.397041032213074, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:38:08,126] [INFO] [logging.py:96:log_dist] [Rank 0] step=2510, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:38:08,127] [INFO] [timer.py:259:stop] epoch=2/micro_step=118/global_step=2510, RunningAvgSamplesPerSec=6.513032020453146, CurrSamplesPerSec=6.281307705028912, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:39:01,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=2520, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:39:01,394] [INFO] [timer.py:259:stop] epoch=2/micro_step=128/global_step=2520, RunningAvgSamplesPerSec=6.5123588739848115, CurrSamplesPerSec=5.832894049549642, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:39:53,494] [INFO] [logging.py:96:log_dist] [Rank 0] step=2530, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:39:53,494] [INFO] [timer.py:259:stop] epoch=2/micro_step=138/global_step=2530, RunningAvgSamplesPerSec=6.512127098294134, CurrSamplesPerSec=7.3100904201728465, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:40:46,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=2540, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:40:46,069] [INFO] [timer.py:259:stop] epoch=2/micro_step=148/global_step=2540, RunningAvgSamplesPerSec=6.511886039112583, CurrSamplesPerSec=6.919861656900477, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:41:36,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=2550, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:41:36,763] [INFO] [timer.py:259:stop] epoch=2/micro_step=158/global_step=2550, RunningAvgSamplesPerSec=6.512319107986227, CurrSamplesPerSec=5.7242795701450255, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:42:27,489] [INFO] [logging.py:96:log_dist] [Rank 0] step=2560, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:42:27,489] [INFO] [timer.py:259:stop] epoch=2/micro_step=168/global_step=2560, RunningAvgSamplesPerSec=6.5127481072790765, CurrSamplesPerSec=6.600344427440011, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:43:18,107] [INFO] [logging.py:96:log_dist] [Rank 0] step=2570, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:43:18,107] [INFO] [timer.py:259:stop] epoch=2/micro_step=178/global_step=2570, RunningAvgSamplesPerSec=6.513336169928995, CurrSamplesPerSec=6.755300747963576, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:44:10,685] [INFO] [logging.py:96:log_dist] [Rank 0] step=2580, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:44:10,686] [INFO] [timer.py:259:stop] epoch=2/micro_step=188/global_step=2580, RunningAvgSamplesPerSec=6.512887520996421, CurrSamplesPerSec=6.4411323583678985, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 00:45:02,462] [INFO] [logging.py:96:log_dist] [Rank 0] step=2590, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:45:02,462] [INFO] [timer.py:259:stop] epoch=2/micro_step=198/global_step=2590, RunningAvgSamplesPerSec=6.512921555301007, CurrSamplesPerSec=5.587436892650525, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:45:54,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=2600, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:45:54,718] [INFO] [timer.py:259:stop] epoch=2/micro_step=208/global_step=2600, RunningAvgSamplesPerSec=6.5126886700227375, CurrSamplesPerSec=6.530375016427028, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:46:45,997] [INFO] [logging.py:96:log_dist] [Rank 0] step=2610, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:46:45,998] [INFO] [timer.py:259:stop] epoch=2/micro_step=218/global_step=2610, RunningAvgSamplesPerSec=6.512921994236289, CurrSamplesPerSec=6.434181776518738, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:47:39,944] [INFO] [logging.py:96:log_dist] [Rank 0] step=2620, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:47:39,945] [INFO] [timer.py:259:stop] epoch=2/micro_step=228/global_step=2620, RunningAvgSamplesPerSec=6.51165281173687, CurrSamplesPerSec=6.855043441191911, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:48:32,701] [INFO] [logging.py:96:log_dist] [Rank 0] step=2630, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:48:32,702] [INFO] [timer.py:259:stop] epoch=2/micro_step=238/global_step=2630, RunningAvgSamplesPerSec=6.511292853320564, CurrSamplesPerSec=6.3681233712970196, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:49:21,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=2640, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:49:21,162] [INFO] [timer.py:259:stop] epoch=2/micro_step=248/global_step=2640, RunningAvgSamplesPerSec=6.513153383121161, CurrSamplesPerSec=7.592347207775469, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:50:13,198] [INFO] [logging.py:96:log_dist] [Rank 0] step=2650, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:50:13,199] [INFO] [timer.py:259:stop] epoch=2/micro_step=258/global_step=2650, RunningAvgSamplesPerSec=6.5128682258607045, CurrSamplesPerSec=6.666119013087082, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:51:05,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=2660, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:51:05,098] [INFO] [timer.py:259:stop] epoch=2/micro_step=268/global_step=2660, RunningAvgSamplesPerSec=6.512818146943022, CurrSamplesPerSec=6.6244014623508924, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:51:57,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=2670, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:51:57,403] [INFO] [timer.py:259:stop] epoch=2/micro_step=278/global_step=2670, RunningAvgSamplesPerSec=6.512514697994173, CurrSamplesPerSec=6.4459506741240595, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:52:48,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=2680, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:52:48,313] [INFO] [timer.py:259:stop] epoch=2/micro_step=288/global_step=2680, RunningAvgSamplesPerSec=6.512987369391066, CurrSamplesPerSec=6.475853022652256, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:53:42,082] [INFO] [logging.py:96:log_dist] [Rank 0] step=2690, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:53:42,082] [INFO] [timer.py:259:stop] epoch=2/micro_step=298/global_step=2690, RunningAvgSamplesPerSec=6.511946294146578, CurrSamplesPerSec=5.9790614745084145, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 00:54:35,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=2700, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:54:35,099] [INFO] [timer.py:259:stop] epoch=2/micro_step=308/global_step=2700, RunningAvgSamplesPerSec=6.511438556021343, CurrSamplesPerSec=6.531567379397074, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:55:26,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=2710, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:55:26,343] [INFO] [timer.py:259:stop] epoch=2/micro_step=318/global_step=2710, RunningAvgSamplesPerSec=6.511536909181053, CurrSamplesPerSec=7.278676207833538, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:56:16,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=2720, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:56:16,870] [INFO] [timer.py:259:stop] epoch=2/micro_step=328/global_step=2720, RunningAvgSamplesPerSec=6.512082013042047, CurrSamplesPerSec=6.223054269959928, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:57:09,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=2730, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:57:09,015] [INFO] [timer.py:259:stop] epoch=2/micro_step=338/global_step=2730, RunningAvgSamplesPerSec=6.511816087263492, CurrSamplesPerSec=5.753948345541845, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 00:58:00,942] [INFO] [logging.py:96:log_dist] [Rank 0] step=2740, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:58:00,943] [INFO] [timer.py:259:stop] epoch=2/micro_step=348/global_step=2740, RunningAvgSamplesPerSec=6.511793198362006, CurrSamplesPerSec=6.497991975963034, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:58:52,301] [INFO] [logging.py:96:log_dist] [Rank 0] step=2750, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:58:52,302] [INFO] [timer.py:259:stop] epoch=2/micro_step=358/global_step=2750, RunningAvgSamplesPerSec=6.51175075995915, CurrSamplesPerSec=6.535665153733121, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 00:59:42,145] [INFO] [logging.py:96:log_dist] [Rank 0] step=2760, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 00:59:42,145] [INFO] [timer.py:259:stop] epoch=2/micro_step=368/global_step=2760, RunningAvgSamplesPerSec=6.512683393311769, CurrSamplesPerSec=6.622301791291445, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 01:00:32,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=2770, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:00:32,754] [INFO] [timer.py:259:stop] epoch=2/micro_step=378/global_step=2770, RunningAvgSamplesPerSec=6.51303324422141, CurrSamplesPerSec=6.221002891190038, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:01:23,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=2780, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:01:23,782] [INFO] [timer.py:259:stop] epoch=2/micro_step=388/global_step=2780, RunningAvgSamplesPerSec=6.51327213587538, CurrSamplesPerSec=7.413264199114365, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:02:14,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=2790, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:02:14,514] [INFO] [timer.py:259:stop] epoch=2/micro_step=398/global_step=2790, RunningAvgSamplesPerSec=6.513722308950218, CurrSamplesPerSec=6.45613965592677, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:03:09,833] [INFO] [logging.py:96:log_dist] [Rank 0] step=2800, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:03:09,834] [INFO] [timer.py:259:stop] epoch=2/micro_step=408/global_step=2800, RunningAvgSamplesPerSec=6.511895761999357, CurrSamplesPerSec=6.019014201808178, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:04:00,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=2810, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:04:00,020] [INFO] [timer.py:259:stop] epoch=2/micro_step=418/global_step=2810, RunningAvgSamplesPerSec=6.512712577817583, CurrSamplesPerSec=6.140647962651469, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:04:50,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=2820, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:04:50,491] [INFO] [timer.py:259:stop] epoch=2/micro_step=428/global_step=2820, RunningAvgSamplesPerSec=6.513190233735134, CurrSamplesPerSec=6.602778385075999, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:05:41,365] [INFO] [logging.py:96:log_dist] [Rank 0] step=2830, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:05:41,366] [INFO] [timer.py:259:stop] epoch=2/micro_step=438/global_step=2830, RunningAvgSamplesPerSec=6.513479592629064, CurrSamplesPerSec=6.280045982382818, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:06:33,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=2840, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:06:33,394] [INFO] [timer.py:259:stop] epoch=2/micro_step=448/global_step=2840, RunningAvgSamplesPerSec=6.513265351931263, CurrSamplesPerSec=6.321112032130304, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:07:24,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=2850, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:07:24,022] [INFO] [timer.py:259:stop] epoch=2/micro_step=458/global_step=2850, RunningAvgSamplesPerSec=6.5136731421059855, CurrSamplesPerSec=7.2297339720532685, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:08:14,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=2860, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:08:14,931] [INFO] [timer.py:259:stop] epoch=2/micro_step=468/global_step=2860, RunningAvgSamplesPerSec=6.5140808110101975, CurrSamplesPerSec=6.415832161118899, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:09:05,773] [INFO] [logging.py:96:log_dist] [Rank 0] step=2870, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:09:05,774] [INFO] [timer.py:259:stop] epoch=2/micro_step=478/global_step=2870, RunningAvgSamplesPerSec=6.514312503789849, CurrSamplesPerSec=5.997242385087294, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:09:57,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=2880, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:09:57,449] [INFO] [timer.py:259:stop] epoch=2/micro_step=488/global_step=2880, RunningAvgSamplesPerSec=6.514304832402291, CurrSamplesPerSec=6.315099774712027, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 01:10:49,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=2890, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:10:49,018] [INFO] [timer.py:259:stop] epoch=2/micro_step=498/global_step=2890, RunningAvgSamplesPerSec=6.514425036529521, CurrSamplesPerSec=5.942168240384165, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 01:11:40,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=2900, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:11:40,011] [INFO] [timer.py:259:stop] epoch=2/micro_step=508/global_step=2900, RunningAvgSamplesPerSec=6.514747239150708, CurrSamplesPerSec=6.783150286520267, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:12:32,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=2910, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:12:32,558] [INFO] [timer.py:259:stop] epoch=2/micro_step=518/global_step=2910, RunningAvgSamplesPerSec=6.514411984875074, CurrSamplesPerSec=6.256363997355868, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:13:26,325] [INFO] [logging.py:96:log_dist] [Rank 0] step=2920, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:13:26,325] [INFO] [timer.py:259:stop] epoch=2/micro_step=528/global_step=2920, RunningAvgSamplesPerSec=6.513443541500288, CurrSamplesPerSec=6.003235720740743, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 01:14:16,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=2930, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:14:16,514] [INFO] [timer.py:259:stop] epoch=2/micro_step=538/global_step=2930, RunningAvgSamplesPerSec=6.514119618662796, CurrSamplesPerSec=6.43760115575404, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:15:07,585] [INFO] [logging.py:96:log_dist] [Rank 0] step=2940, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:15:07,586] [INFO] [timer.py:259:stop] epoch=2/micro_step=548/global_step=2940, RunningAvgSamplesPerSec=6.514352830818419, CurrSamplesPerSec=5.978996485298047, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:15:59,230] [INFO] [logging.py:96:log_dist] [Rank 0] step=2950, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:15:59,231] [INFO] [timer.py:259:stop] epoch=2/micro_step=558/global_step=2950, RunningAvgSamplesPerSec=6.514359796968952, CurrSamplesPerSec=7.080022789404433, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:16:51,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=2960, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:16:51,850] [INFO] [timer.py:259:stop] epoch=2/micro_step=568/global_step=2960, RunningAvgSamplesPerSec=6.513994278461152, CurrSamplesPerSec=6.72393513940391, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:17:43,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=2970, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:17:43,938] [INFO] [timer.py:259:stop] epoch=2/micro_step=578/global_step=2970, RunningAvgSamplesPerSec=6.51386691818992, CurrSamplesPerSec=6.88345489142667, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:18:35,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=2980, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:18:35,526] [INFO] [timer.py:259:stop] epoch=2/micro_step=588/global_step=2980, RunningAvgSamplesPerSec=6.513870382225099, CurrSamplesPerSec=7.071289820263867, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:19:30,285] [INFO] [logging.py:96:log_dist] [Rank 0] step=2990, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:19:30,286] [INFO] [timer.py:259:stop] epoch=2/micro_step=598/global_step=2990, RunningAvgSamplesPerSec=6.512574039717603, CurrSamplesPerSec=6.458265781680316, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 2990 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 01:19:43,307] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2990 is about to be saved! +[2024-10-24 01:19:43,308] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2990.bin, tag: global_step2990 +[2024-10-24 01:19:43,308] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2990.bin... +[2024-10-24 01:20:12,131] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2990.bin. +[2024-10-24 01:20:12,133] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2990 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 01:20:25,118] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step2990 is about to be saved! +[2024-10-24 01:20:25,118] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2990.bin, tag: global_step2990 +[2024-10-24 01:20:25,119] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2990.bin... +[2024-10-24 01:20:56,283] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_2990.bin. +[2024-10-24 01:20:56,285] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step2990 is ready now! +Model saved! +Checkpoint saved. +[2024-10-24 01:21:45,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=3000, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:21:45,346] [INFO] [timer.py:259:stop] epoch=2/micro_step=608/global_step=3000, RunningAvgSamplesPerSec=6.513956130827349, CurrSamplesPerSec=7.002147300997673, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:22:34,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=3010, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:22:34,654] [INFO] [timer.py:259:stop] epoch=2/micro_step=618/global_step=3010, RunningAvgSamplesPerSec=6.515036726221774, CurrSamplesPerSec=7.603365145001198, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:23:25,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=3020, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:23:25,802] [INFO] [timer.py:259:stop] epoch=2/micro_step=628/global_step=3020, RunningAvgSamplesPerSec=6.515390106306801, CurrSamplesPerSec=7.109449536890363, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:24:19,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=3030, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:24:19,470] [INFO] [timer.py:259:stop] epoch=2/micro_step=638/global_step=3030, RunningAvgSamplesPerSec=6.514402615132137, CurrSamplesPerSec=6.515988241404288, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 01:25:10,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=3040, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:25:10,671] [INFO] [timer.py:259:stop] epoch=2/micro_step=648/global_step=3040, RunningAvgSamplesPerSec=6.514761486052804, CurrSamplesPerSec=7.130705929237001, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:26:03,458] [INFO] [logging.py:96:log_dist] [Rank 0] step=3050, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:26:03,459] [INFO] [timer.py:259:stop] epoch=2/micro_step=658/global_step=3050, RunningAvgSamplesPerSec=6.514179615242363, CurrSamplesPerSec=7.042337096060046, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:26:55,035] [INFO] [logging.py:96:log_dist] [Rank 0] step=3060, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:26:55,036] [INFO] [timer.py:259:stop] epoch=2/micro_step=668/global_step=3060, RunningAvgSamplesPerSec=6.5142111575367485, CurrSamplesPerSec=6.102415415042262, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:27:47,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=3070, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:27:47,802] [INFO] [timer.py:259:stop] epoch=2/micro_step=678/global_step=3070, RunningAvgSamplesPerSec=6.513684688310297, CurrSamplesPerSec=6.417828090862259, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:28:40,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=3080, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:28:40,113] [INFO] [timer.py:259:stop] epoch=2/micro_step=688/global_step=3080, RunningAvgSamplesPerSec=6.513462080789861, CurrSamplesPerSec=8.235797818829722, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:29:31,350] [INFO] [logging.py:96:log_dist] [Rank 0] step=3090, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:29:31,351] [INFO] [timer.py:259:stop] epoch=2/micro_step=698/global_step=3090, RunningAvgSamplesPerSec=6.513771930907221, CurrSamplesPerSec=6.764508593905333, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:30:20,873] [INFO] [logging.py:96:log_dist] [Rank 0] step=3100, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:30:20,874] [INFO] [timer.py:259:stop] epoch=2/micro_step=708/global_step=3100, RunningAvgSamplesPerSec=6.514770734911659, CurrSamplesPerSec=7.144534193008662, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:31:12,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=3110, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:31:12,532] [INFO] [timer.py:259:stop] epoch=2/micro_step=718/global_step=3110, RunningAvgSamplesPerSec=6.514831660576769, CurrSamplesPerSec=6.842558717304063, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:32:02,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=3120, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:32:02,530] [INFO] [timer.py:259:stop] epoch=2/micro_step=728/global_step=3120, RunningAvgSamplesPerSec=6.515702268895799, CurrSamplesPerSec=5.918780400574305, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:32:56,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=3130, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:32:56,027] [INFO] [timer.py:259:stop] epoch=2/micro_step=738/global_step=3130, RunningAvgSamplesPerSec=6.51482132808384, CurrSamplesPerSec=6.412960704824894, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:33:48,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=3140, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:33:48,470] [INFO] [timer.py:259:stop] epoch=2/micro_step=748/global_step=3140, RunningAvgSamplesPerSec=6.514496796999612, CurrSamplesPerSec=6.469531410533289, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:34:39,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=3150, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:34:39,262] [INFO] [timer.py:259:stop] epoch=2/micro_step=758/global_step=3150, RunningAvgSamplesPerSec=6.514796999961643, CurrSamplesPerSec=5.633418451860876, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:35:30,740] [INFO] [logging.py:96:log_dist] [Rank 0] step=3160, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:35:30,741] [INFO] [timer.py:259:stop] epoch=2/micro_step=768/global_step=3160, RunningAvgSamplesPerSec=6.514831309826048, CurrSamplesPerSec=6.487594994917642, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:36:21,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=3170, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:36:21,914] [INFO] [timer.py:259:stop] epoch=2/micro_step=778/global_step=3170, RunningAvgSamplesPerSec=6.514953058215115, CurrSamplesPerSec=6.326081253900177, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:37:14,532] [INFO] [logging.py:96:log_dist] [Rank 0] step=3180, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:37:14,532] [INFO] [timer.py:259:stop] epoch=2/micro_step=788/global_step=3180, RunningAvgSamplesPerSec=6.514500522260303, CurrSamplesPerSec=5.955604341798829, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:38:07,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=3190, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:38:07,782] [INFO] [timer.py:259:stop] epoch=2/micro_step=798/global_step=3190, RunningAvgSamplesPerSec=6.51384764622324, CurrSamplesPerSec=5.852038024107369, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:38:59,769] [INFO] [logging.py:96:log_dist] [Rank 0] step=3200, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:38:59,770] [INFO] [timer.py:259:stop] epoch=2/micro_step=808/global_step=3200, RunningAvgSamplesPerSec=6.513735359465741, CurrSamplesPerSec=6.274666210612316, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 01:39:50,740] [INFO] [logging.py:96:log_dist] [Rank 0] step=3210, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:39:50,741] [INFO] [timer.py:259:stop] epoch=2/micro_step=818/global_step=3210, RunningAvgSamplesPerSec=6.514187551831477, CurrSamplesPerSec=6.26736321942588, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 01:40:41,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=3220, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:40:41,422] [INFO] [timer.py:259:stop] epoch=2/micro_step=828/global_step=3220, RunningAvgSamplesPerSec=6.5147189382038295, CurrSamplesPerSec=6.638205946244263, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:41:34,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=3230, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:41:34,490] [INFO] [timer.py:259:stop] epoch=2/micro_step=838/global_step=3230, RunningAvgSamplesPerSec=6.514278662562831, CurrSamplesPerSec=6.104783030408686, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:42:27,021] [INFO] [logging.py:96:log_dist] [Rank 0] step=3240, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:42:27,022] [INFO] [timer.py:259:stop] epoch=2/micro_step=848/global_step=3240, RunningAvgSamplesPerSec=6.513846054668941, CurrSamplesPerSec=6.937363142896987, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:43:18,457] [INFO] [logging.py:96:log_dist] [Rank 0] step=3250, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:43:18,458] [INFO] [timer.py:259:stop] epoch=2/micro_step=858/global_step=3250, RunningAvgSamplesPerSec=6.513927947716509, CurrSamplesPerSec=6.5394370117531695, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 01:44:09,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=3260, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:44:09,103] [INFO] [timer.py:259:stop] epoch=2/micro_step=868/global_step=3260, RunningAvgSamplesPerSec=6.514364279409986, CurrSamplesPerSec=7.3143131401283465, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:45:01,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=3270, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:45:01,782] [INFO] [timer.py:259:stop] epoch=2/micro_step=878/global_step=3270, RunningAvgSamplesPerSec=6.51379793260079, CurrSamplesPerSec=6.794736783261613, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:45:53,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=3280, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:45:53,878] [INFO] [timer.py:259:stop] epoch=2/micro_step=888/global_step=3280, RunningAvgSamplesPerSec=6.513674533482559, CurrSamplesPerSec=6.681799792408644, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:46:46,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=3290, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:46:46,782] [INFO] [timer.py:259:stop] epoch=2/micro_step=898/global_step=3290, RunningAvgSamplesPerSec=6.513050812213571, CurrSamplesPerSec=6.9927114852406165, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:47:37,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=3300, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:47:37,978] [INFO] [timer.py:259:stop] epoch=2/micro_step=908/global_step=3300, RunningAvgSamplesPerSec=6.513248072552907, CurrSamplesPerSec=6.055825296512563, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:48:29,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=3310, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:48:29,830] [INFO] [timer.py:259:stop] epoch=2/micro_step=918/global_step=3310, RunningAvgSamplesPerSec=6.513203627838249, CurrSamplesPerSec=5.8508352146889795, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:49:21,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=3320, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:49:21,678] [INFO] [timer.py:259:stop] epoch=2/micro_step=928/global_step=3320, RunningAvgSamplesPerSec=6.513067733040919, CurrSamplesPerSec=6.733000538205261, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:50:14,034] [INFO] [logging.py:96:log_dist] [Rank 0] step=3330, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:50:14,035] [INFO] [timer.py:259:stop] epoch=2/micro_step=938/global_step=3330, RunningAvgSamplesPerSec=6.512838121902733, CurrSamplesPerSec=5.964853204485455, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:51:04,473] [INFO] [logging.py:96:log_dist] [Rank 0] step=3340, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:51:04,474] [INFO] [timer.py:259:stop] epoch=2/micro_step=948/global_step=3340, RunningAvgSamplesPerSec=6.513244105022571, CurrSamplesPerSec=6.597745556008919, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 01:51:57,093] [INFO] [logging.py:96:log_dist] [Rank 0] step=3350, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:51:57,093] [INFO] [timer.py:259:stop] epoch=2/micro_step=958/global_step=3350, RunningAvgSamplesPerSec=6.512849221369867, CurrSamplesPerSec=7.14830048759439, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:52:47,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=3360, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:52:47,514] [INFO] [timer.py:259:stop] epoch=2/micro_step=968/global_step=3360, RunningAvgSamplesPerSec=6.513205809078882, CurrSamplesPerSec=7.098299823193807, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:53:38,490] [INFO] [logging.py:96:log_dist] [Rank 0] step=3370, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:53:38,490] [INFO] [timer.py:259:stop] epoch=2/micro_step=978/global_step=3370, RunningAvgSamplesPerSec=6.513487405619041, CurrSamplesPerSec=7.111327681629262, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:54:30,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=3380, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:54:30,238] [INFO] [timer.py:259:stop] epoch=2/micro_step=988/global_step=3380, RunningAvgSamplesPerSec=6.51355180875649, CurrSamplesPerSec=6.375973656983662, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:55:20,981] [INFO] [logging.py:96:log_dist] [Rank 0] step=3390, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:55:20,982] [INFO] [timer.py:259:stop] epoch=2/micro_step=998/global_step=3390, RunningAvgSamplesPerSec=6.514051278673107, CurrSamplesPerSec=6.154673706163247, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:56:15,234] [INFO] [logging.py:96:log_dist] [Rank 0] step=3400, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:56:15,234] [INFO] [timer.py:259:stop] epoch=2/micro_step=1008/global_step=3400, RunningAvgSamplesPerSec=6.512939834787831, CurrSamplesPerSec=5.548210784237724, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:57:06,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=3410, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:57:06,135] [INFO] [timer.py:259:stop] epoch=2/micro_step=1018/global_step=3410, RunningAvgSamplesPerSec=6.513208771390289, CurrSamplesPerSec=7.111552251491729, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:57:57,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=3420, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:57:57,080] [INFO] [timer.py:259:stop] epoch=2/micro_step=1028/global_step=3420, RunningAvgSamplesPerSec=6.51345163182096, CurrSamplesPerSec=6.0033968312296775, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 01:58:49,325] [INFO] [logging.py:96:log_dist] [Rank 0] step=3430, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:58:49,326] [INFO] [timer.py:259:stop] epoch=2/micro_step=1038/global_step=3430, RunningAvgSamplesPerSec=6.51322296874406, CurrSamplesPerSec=6.370113002472367, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 01:59:41,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=3440, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 01:59:41,627] [INFO] [timer.py:259:stop] epoch=2/micro_step=1048/global_step=3440, RunningAvgSamplesPerSec=6.512924053589059, CurrSamplesPerSec=6.362647807196469, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:00:33,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=3450, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:00:33,014] [INFO] [timer.py:259:stop] epoch=2/micro_step=1058/global_step=3450, RunningAvgSamplesPerSec=6.51301986573677, CurrSamplesPerSec=6.220050924988457, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:01:25,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=3460, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:01:25,514] [INFO] [timer.py:259:stop] epoch=2/micro_step=1068/global_step=3460, RunningAvgSamplesPerSec=6.512597002820951, CurrSamplesPerSec=6.953701235132756, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:02:17,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=3470, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:02:17,675] [INFO] [timer.py:259:stop] epoch=2/micro_step=1078/global_step=3470, RunningAvgSamplesPerSec=6.512518099644894, CurrSamplesPerSec=6.595618017637018, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:03:08,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=3480, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:03:08,257] [INFO] [timer.py:259:stop] epoch=2/micro_step=1088/global_step=3480, RunningAvgSamplesPerSec=6.512918518630661, CurrSamplesPerSec=6.555846787830885, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:04:00,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=3490, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:04:00,779] [INFO] [timer.py:259:stop] epoch=2/micro_step=1098/global_step=3490, RunningAvgSamplesPerSec=6.512602201453771, CurrSamplesPerSec=6.259647154742363, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:04:53,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=3500, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:04:53,945] [INFO] [timer.py:259:stop] epoch=2/micro_step=1108/global_step=3500, RunningAvgSamplesPerSec=6.512049249340864, CurrSamplesPerSec=6.206371918076404, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:05:44,191] [INFO] [logging.py:96:log_dist] [Rank 0] step=3510, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:05:44,192] [INFO] [timer.py:259:stop] epoch=2/micro_step=1118/global_step=3510, RunningAvgSamplesPerSec=6.512687233519719, CurrSamplesPerSec=6.247942246939033, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:06:35,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=3520, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:06:35,950] [INFO] [timer.py:259:stop] epoch=2/micro_step=1128/global_step=3520, RunningAvgSamplesPerSec=6.512622546116682, CurrSamplesPerSec=6.011137625137278, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:07:26,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=3530, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:07:26,027] [INFO] [timer.py:259:stop] epoch=2/micro_step=1138/global_step=3530, RunningAvgSamplesPerSec=6.513228742400738, CurrSamplesPerSec=6.75407832758376, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:08:15,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=3540, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:08:15,786] [INFO] [timer.py:259:stop] epoch=2/micro_step=1148/global_step=3540, RunningAvgSamplesPerSec=6.513878954477393, CurrSamplesPerSec=7.1654996652741385, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:09:07,118] [INFO] [logging.py:96:log_dist] [Rank 0] step=3550, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:09:07,119] [INFO] [timer.py:259:stop] epoch=2/micro_step=1158/global_step=3550, RunningAvgSamplesPerSec=6.513936793196858, CurrSamplesPerSec=6.626498866531786, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:09:57,194] [INFO] [logging.py:96:log_dist] [Rank 0] step=3560, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:09:57,194] [INFO] [timer.py:259:stop] epoch=2/micro_step=1168/global_step=3560, RunningAvgSamplesPerSec=6.514643598906785, CurrSamplesPerSec=7.1745141996509, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:10:48,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=3570, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:10:48,858] [INFO] [timer.py:259:stop] epoch=2/micro_step=1178/global_step=3570, RunningAvgSamplesPerSec=6.514698529988271, CurrSamplesPerSec=5.601539298001074, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:11:41,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=3580, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:11:41,422] [INFO] [timer.py:259:stop] epoch=2/micro_step=1188/global_step=3580, RunningAvgSamplesPerSec=6.514317547739948, CurrSamplesPerSec=6.331001852958412, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 3588 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 02:12:34,471] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step3588 is about to be saved! +[2024-10-24 02:12:34,472] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_3588.bin, tag: global_step3588 +[2024-10-24 02:12:34,472] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_3588.bin... +[2024-10-24 02:13:05,546] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_3588.bin. +[2024-10-24 02:13:05,548] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step3588 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 02:13:18,334] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step3588 is about to be saved! +[2024-10-24 02:13:18,335] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_3588.bin, tag: global_step3588 +[2024-10-24 02:13:18,335] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_3588.bin... +[2024-10-24 02:13:55,306] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_3588.bin. +[2024-10-24 02:13:55,308] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step3588 is ready now! +Model saved! +Checkpoint saved. + +***** Evaluating at epoch 3/5.0 ***** + +***** Evaluating at the beginning ***** +Evaluation: accuracy = 0.675061, reward_mean = 9.768251, reward_std = 3.161652 +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ +┃ prompt  ┃ higher-reward response  ┃ reward  ┃ lower-reward response  ┃ reward  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ +│ USER: │ There are two bags visible in the image. │ 10.000000 │ There are four bags visible in the │ 9.562500 │ +│ How many bags are visible in the image? │ │ │ image. │ │ +│ │ │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the signs and advertisements │ 9.500000 │ Based on the presence of multiple │ 6.031250 │ +│ What can you infer about the purpose of │ visible in the image, it can be inferred │ │ signs, including a large red sign that │ │ +│ the building with the pyramid-shaped │ that the building with the │ │ reads "Sale," it is likely that this │ │ +│ structure based on the signs and │ pyramid-shaped structure is likely a │ │ building serves as an outlet for retail │ │ +│ advertisements? │ shopping center or a commercial complex. │ │ or commercial purposes. The pyramid │ │ +│ │ The signs include names of various │ │ shape suggests that there might be some │ │ +│ │ brands and stores, such as "Ochsner," │ │ architectural significance to the │ │ +│ │ "Manor," and "Traita fina," which are │ │ design, but without more context, we │ │ +│ │ typically associated with retail and │ │ cannot definitively determine its │ │ +│ │ shopping. Additionally, the large text │ │ specific function. However, given the │ │ +│ │ "Shopping Center St. Jakob-Park" further │ │ promotional nature of the signs, one │ │ +│ │ supports the idea that this building is │ │ could reasonably assume that the │ │ +│ │ a shopping center. The prominent display │ │ building houses shops or businesses │ │ +│ │ of the word "BASEL" suggests that this │ │ offering discounts or special deals to │ │ +│ │ shopping center is located in or near │ │ attract customers. │ │ +│ │ Basel, Switzerland. │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the lighting and shadows in the │ 11.000000 │ The image shows a plane at an airport │ 10.625000 │ +│ Based on the lighting and shadows, can │ image, it appears that the sun is │ │ terminal with lights in the distance. │ │ +│ you infer if the sun is rising or │ setting. The sky has a gradient from a │ │ The sky has hues of yellow and orange │ │ +│ setting? │ darker blue at the top to a lighter │ │ near the horizon which suggests that │ │ +│ │ yellow-orange near the horizon, which is │ │ it's either sunrise or sunset. However, │ │ +│ │ typical of sunset. Additionally, the │ │ there are no direct sunlight rays │ │ +│ │ lights at the airport are on, which is │ │ visible, so we cannot determine whether │ │ +│ │ more common during the evening as it │ │ this photo was taken during dawn │ │ +│ │ gets darker. │ │ (sunrise) or dusk (sunset). It could be │ │ +│ │ │ │ either one depending on where the │ │ +│ │ │ │ photographer took the picture from. If │ │ +│ │ │ │ they were standing facing east when │ │ +│ │ │ │ taking the shot, then it would likely │ │ +│ │ │ │ be morning; but if they faced west │ │ +│ │ │ │ while capturing the scene, it might │ │ +│ │ │ │ have been evening instead. │ │ +│ │ │ │ │ │ +│ │ │ │ It appears to be early morning based │ │ +│ │ │ │ off how dark everything else looks │ │ +│ │ │ │ except for those areas around the │ │ +│ │ │ │ horizon line indicating light coming up │ │ +│ │ │ │ over mountains behind them - suggesting │ │ +│ │ │ │ daytime hours ahead rather than │ │ +│ │ │ │ nighttime ones since all other parts │ │ +│ │ │ │ remain shadowed by darkness still │ │ +│ │ │ │ lingering outside city limits below us │ │ +│ │ │ │ here today... │ │ +└──────────────────────────────────────────┴──────────────────────────────────────────┴───────────┴─────────────────────────────────────────┴───────────┘ +[2024-10-24 02:34:38,156] [WARNING] [stage3.py:2102:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time +[2024-10-24 02:34:43,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=3590, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:34:43,235] [INFO] [timer.py:259:stop] epoch=3/micro_step=2/global_step=3590, RunningAvgSamplesPerSec=6.514885529302918, CurrSamplesPerSec=6.547523337486056, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:35:34,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=3600, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:35:34,434] [INFO] [timer.py:259:stop] epoch=3/micro_step=12/global_step=3600, RunningAvgSamplesPerSec=6.5151468556156855, CurrSamplesPerSec=6.614120105075223, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:36:26,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=3610, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:36:26,465] [INFO] [timer.py:259:stop] epoch=3/micro_step=22/global_step=3610, RunningAvgSamplesPerSec=6.514904701981346, CurrSamplesPerSec=7.511018195617106, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:37:19,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=3620, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:37:19,414] [INFO] [timer.py:259:stop] epoch=3/micro_step=32/global_step=3620, RunningAvgSamplesPerSec=6.5144706309238165, CurrSamplesPerSec=6.38769758479095, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:38:13,435] [INFO] [logging.py:96:log_dist] [Rank 0] step=3630, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:38:13,436] [INFO] [timer.py:259:stop] epoch=3/micro_step=42/global_step=3630, RunningAvgSamplesPerSec=6.51357861024806, CurrSamplesPerSec=7.124015846122856, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:39:06,665] [INFO] [logging.py:96:log_dist] [Rank 0] step=3640, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:39:06,666] [INFO] [timer.py:259:stop] epoch=3/micro_step=52/global_step=3640, RunningAvgSamplesPerSec=6.513046605989794, CurrSamplesPerSec=6.970626152504918, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:39:56,325] [INFO] [logging.py:96:log_dist] [Rank 0] step=3650, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:39:56,326] [INFO] [timer.py:259:stop] epoch=3/micro_step=62/global_step=3650, RunningAvgSamplesPerSec=6.513760795240664, CurrSamplesPerSec=6.354439758981086, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:40:48,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=3660, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:40:48,670] [INFO] [timer.py:259:stop] epoch=3/micro_step=72/global_step=3660, RunningAvgSamplesPerSec=6.513467572996743, CurrSamplesPerSec=5.57274834721296, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:41:41,059] [INFO] [logging.py:96:log_dist] [Rank 0] step=3670, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:41:41,059] [INFO] [timer.py:259:stop] epoch=3/micro_step=82/global_step=3670, RunningAvgSamplesPerSec=6.513246742052412, CurrSamplesPerSec=5.274453953915161, MemAllocated=25.43GB, MaxMemAllocated=45.92GB +[2024-10-24 02:42:32,213] [INFO] [logging.py:96:log_dist] [Rank 0] step=3680, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:42:32,214] [INFO] [timer.py:259:stop] epoch=3/micro_step=92/global_step=3680, RunningAvgSamplesPerSec=6.513482131030141, CurrSamplesPerSec=6.970445870736209, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:43:22,073] [INFO] [logging.py:96:log_dist] [Rank 0] step=3690, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:43:22,074] [INFO] [timer.py:259:stop] epoch=3/micro_step=102/global_step=3690, RunningAvgSamplesPerSec=6.514206399389043, CurrSamplesPerSec=6.6332799451738005, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:44:12,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=3700, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:44:12,141] [INFO] [timer.py:259:stop] epoch=3/micro_step=112/global_step=3700, RunningAvgSamplesPerSec=6.51485046153494, CurrSamplesPerSec=7.12012891229942, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:45:06,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=3710, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:45:06,567] [INFO] [timer.py:259:stop] epoch=3/micro_step=122/global_step=3710, RunningAvgSamplesPerSec=6.513876466709501, CurrSamplesPerSec=5.506305032397695, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 02:46:01,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=3720, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:46:01,362] [INFO] [timer.py:259:stop] epoch=3/micro_step=132/global_step=3720, RunningAvgSamplesPerSec=6.512883509648147, CurrSamplesPerSec=5.932951772940461, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 02:46:51,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=3730, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:46:51,568] [INFO] [timer.py:259:stop] epoch=3/micro_step=142/global_step=3730, RunningAvgSamplesPerSec=6.51344499555298, CurrSamplesPerSec=5.982535391587216, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:47:43,337] [INFO] [logging.py:96:log_dist] [Rank 0] step=3740, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:47:43,338] [INFO] [timer.py:259:stop] epoch=3/micro_step=152/global_step=3740, RunningAvgSamplesPerSec=6.513401091801878, CurrSamplesPerSec=7.327187019833651, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:48:34,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=3750, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:48:34,771] [INFO] [timer.py:259:stop] epoch=3/micro_step=162/global_step=3750, RunningAvgSamplesPerSec=6.513524708196465, CurrSamplesPerSec=6.900468067530899, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:49:25,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=3760, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:49:25,564] [INFO] [timer.py:259:stop] epoch=3/micro_step=172/global_step=3760, RunningAvgSamplesPerSec=6.513817166577291, CurrSamplesPerSec=7.280275986117298, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:50:17,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=3770, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:50:17,063] [INFO] [timer.py:259:stop] epoch=3/micro_step=182/global_step=3770, RunningAvgSamplesPerSec=6.513819230245731, CurrSamplesPerSec=4.88426803647925, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 02:51:07,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=3780, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:51:07,190] [INFO] [timer.py:259:stop] epoch=3/micro_step=192/global_step=3780, RunningAvgSamplesPerSec=6.514384504085321, CurrSamplesPerSec=6.848876818717633, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:51:59,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=3790, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:51:59,448] [INFO] [timer.py:259:stop] epoch=3/micro_step=202/global_step=3790, RunningAvgSamplesPerSec=6.514228037695708, CurrSamplesPerSec=6.443656930282968, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 02:52:51,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=3800, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:52:51,227] [INFO] [timer.py:259:stop] epoch=3/micro_step=212/global_step=3800, RunningAvgSamplesPerSec=6.5142340680258135, CurrSamplesPerSec=6.8893968047359655, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:53:44,121] [INFO] [logging.py:96:log_dist] [Rank 0] step=3810, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:53:44,122] [INFO] [timer.py:259:stop] epoch=3/micro_step=222/global_step=3810, RunningAvgSamplesPerSec=6.513830841500784, CurrSamplesPerSec=6.517420932620713, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 02:54:38,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=3820, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:54:38,061] [INFO] [timer.py:259:stop] epoch=3/micro_step=232/global_step=3820, RunningAvgSamplesPerSec=6.513031028032344, CurrSamplesPerSec=6.833124438348671, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:55:29,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=3830, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:55:29,518] [INFO] [timer.py:259:stop] epoch=3/micro_step=242/global_step=3830, RunningAvgSamplesPerSec=6.513191982867551, CurrSamplesPerSec=7.187705409801208, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:56:19,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=3840, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:56:19,318] [INFO] [timer.py:259:stop] epoch=3/micro_step=252/global_step=3840, RunningAvgSamplesPerSec=6.5139390077382835, CurrSamplesPerSec=6.420776397646033, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 02:57:09,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=3850, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:57:09,527] [INFO] [timer.py:259:stop] epoch=3/micro_step=262/global_step=3850, RunningAvgSamplesPerSec=6.5143846350765084, CurrSamplesPerSec=7.260461911533338, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:58:02,605] [INFO] [logging.py:96:log_dist] [Rank 0] step=3860, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:58:02,605] [INFO] [timer.py:259:stop] epoch=3/micro_step=272/global_step=3860, RunningAvgSamplesPerSec=6.513960100886721, CurrSamplesPerSec=5.636641651379105, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:58:54,944] [INFO] [logging.py:96:log_dist] [Rank 0] step=3870, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:58:54,944] [INFO] [timer.py:259:stop] epoch=3/micro_step=282/global_step=3870, RunningAvgSamplesPerSec=6.513695570998896, CurrSamplesPerSec=6.276375088386073, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 02:59:46,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=3880, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 02:59:46,087] [INFO] [timer.py:259:stop] epoch=3/micro_step=292/global_step=3880, RunningAvgSamplesPerSec=6.513967620784877, CurrSamplesPerSec=6.837058435504169, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:00:39,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=3890, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:00:39,310] [INFO] [timer.py:259:stop] epoch=3/micro_step=302/global_step=3890, RunningAvgSamplesPerSec=6.513441852521109, CurrSamplesPerSec=6.328045890209956, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:01:32,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=3900, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:01:32,473] [INFO] [timer.py:259:stop] epoch=3/micro_step=312/global_step=3900, RunningAvgSamplesPerSec=6.51292375507179, CurrSamplesPerSec=6.433919917953123, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:02:23,127] [INFO] [logging.py:96:log_dist] [Rank 0] step=3910, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:02:23,128] [INFO] [timer.py:259:stop] epoch=3/micro_step=322/global_step=3910, RunningAvgSamplesPerSec=6.513296251675087, CurrSamplesPerSec=7.1719222347016744, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:03:14,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=3920, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:03:14,101] [INFO] [timer.py:259:stop] epoch=3/micro_step=332/global_step=3920, RunningAvgSamplesPerSec=6.513514918214403, CurrSamplesPerSec=6.213373162463607, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:04:07,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=3930, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:04:07,817] [INFO] [timer.py:259:stop] epoch=3/micro_step=342/global_step=3930, RunningAvgSamplesPerSec=6.5128126529155645, CurrSamplesPerSec=6.616414196437096, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:04:58,499] [INFO] [logging.py:96:log_dist] [Rank 0] step=3940, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:04:58,499] [INFO] [timer.py:259:stop] epoch=3/micro_step=352/global_step=3940, RunningAvgSamplesPerSec=6.513084739146665, CurrSamplesPerSec=6.64300807891153, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:05:48,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=3950, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:05:48,738] [INFO] [timer.py:259:stop] epoch=3/micro_step=362/global_step=3950, RunningAvgSamplesPerSec=6.513557712427054, CurrSamplesPerSec=6.701968360689885, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:06:40,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=3960, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:06:40,098] [INFO] [timer.py:259:stop] epoch=3/micro_step=372/global_step=3960, RunningAvgSamplesPerSec=6.513728530432179, CurrSamplesPerSec=6.18462742464251, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:07:30,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=3970, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:07:30,716] [INFO] [timer.py:259:stop] epoch=3/micro_step=382/global_step=3970, RunningAvgSamplesPerSec=6.514110531842834, CurrSamplesPerSec=6.5021828479762185, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 03:08:20,065] [INFO] [logging.py:96:log_dist] [Rank 0] step=3980, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:08:20,066] [INFO] [timer.py:259:stop] epoch=3/micro_step=392/global_step=3980, RunningAvgSamplesPerSec=6.51486169333884, CurrSamplesPerSec=7.1663719751530195, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:09:13,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=3990, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:09:13,964] [INFO] [timer.py:259:stop] epoch=3/micro_step=402/global_step=3990, RunningAvgSamplesPerSec=6.5140671146718505, CurrSamplesPerSec=6.6461261499204225, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 03:10:07,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=4000, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:10:07,045] [INFO] [timer.py:259:stop] epoch=3/micro_step=412/global_step=4000, RunningAvgSamplesPerSec=6.5135743283673735, CurrSamplesPerSec=7.099100650080468, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:10:56,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=4010, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:10:56,502] [INFO] [timer.py:259:stop] epoch=3/micro_step=422/global_step=4010, RunningAvgSamplesPerSec=6.5142990929215525, CurrSamplesPerSec=7.182307112801109, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:11:46,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=4020, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:11:46,923] [INFO] [timer.py:259:stop] epoch=3/micro_step=432/global_step=4020, RunningAvgSamplesPerSec=6.514762869551188, CurrSamplesPerSec=7.321850851302871, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:12:38,555] [INFO] [logging.py:96:log_dist] [Rank 0] step=4030, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:12:38,556] [INFO] [timer.py:259:stop] epoch=3/micro_step=442/global_step=4030, RunningAvgSamplesPerSec=6.514782025593831, CurrSamplesPerSec=6.697554517786538, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:13:30,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=4040, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:13:30,303] [INFO] [timer.py:259:stop] epoch=3/micro_step=452/global_step=4040, RunningAvgSamplesPerSec=6.514748652660218, CurrSamplesPerSec=6.363457167312536, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:14:20,206] [INFO] [logging.py:96:log_dist] [Rank 0] step=4050, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:14:20,206] [INFO] [timer.py:259:stop] epoch=3/micro_step=462/global_step=4050, RunningAvgSamplesPerSec=6.51545217637484, CurrSamplesPerSec=7.080758981599538, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:15:11,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=4060, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:15:11,822] [INFO] [timer.py:259:stop] epoch=3/micro_step=472/global_step=4060, RunningAvgSamplesPerSec=6.515534802934547, CurrSamplesPerSec=7.013293223706789, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:16:02,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=4070, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:16:02,730] [INFO] [timer.py:259:stop] epoch=3/micro_step=482/global_step=4070, RunningAvgSamplesPerSec=6.515767018679566, CurrSamplesPerSec=7.513435435763654, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:16:55,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=4080, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:16:55,110] [INFO] [timer.py:259:stop] epoch=3/micro_step=492/global_step=4080, RunningAvgSamplesPerSec=6.515571665830849, CurrSamplesPerSec=6.632617469848105, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:17:47,482] [INFO] [logging.py:96:log_dist] [Rank 0] step=4090, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:17:47,483] [INFO] [timer.py:259:stop] epoch=3/micro_step=502/global_step=4090, RunningAvgSamplesPerSec=6.515392019889172, CurrSamplesPerSec=6.44021041201497, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:18:37,921] [INFO] [logging.py:96:log_dist] [Rank 0] step=4100, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:18:37,922] [INFO] [timer.py:259:stop] epoch=3/micro_step=512/global_step=4100, RunningAvgSamplesPerSec=6.515718092820711, CurrSamplesPerSec=7.827965128454701, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:19:32,056] [INFO] [logging.py:96:log_dist] [Rank 0] step=4110, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:19:32,057] [INFO] [timer.py:259:stop] epoch=3/micro_step=522/global_step=4110, RunningAvgSamplesPerSec=6.514922951659305, CurrSamplesPerSec=6.993392827391669, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:20:23,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=4120, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:20:23,938] [INFO] [timer.py:259:stop] epoch=3/micro_step=532/global_step=4120, RunningAvgSamplesPerSec=6.514877226065281, CurrSamplesPerSec=7.041472923121869, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:21:14,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=4130, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:21:14,227] [INFO] [timer.py:259:stop] epoch=3/micro_step=542/global_step=4130, RunningAvgSamplesPerSec=6.515361769601305, CurrSamplesPerSec=7.194853164615649, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:22:07,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=4140, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:22:07,099] [INFO] [timer.py:259:stop] epoch=3/micro_step=552/global_step=4140, RunningAvgSamplesPerSec=6.514997507300966, CurrSamplesPerSec=5.897226795715332, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 03:22:59,053] [INFO] [logging.py:96:log_dist] [Rank 0] step=4150, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:22:59,054] [INFO] [timer.py:259:stop] epoch=3/micro_step=562/global_step=4150, RunningAvgSamplesPerSec=6.514937518032245, CurrSamplesPerSec=6.577639475539208, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:23:49,852] [INFO] [logging.py:96:log_dist] [Rank 0] step=4160, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:23:49,853] [INFO] [timer.py:259:stop] epoch=3/micro_step=572/global_step=4160, RunningAvgSamplesPerSec=6.515265395498821, CurrSamplesPerSec=6.462228197090764, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:24:43,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=4170, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:24:43,750] [INFO] [timer.py:259:stop] epoch=3/micro_step=582/global_step=4170, RunningAvgSamplesPerSec=6.5145711722245085, CurrSamplesPerSec=6.1258350909187875, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:25:34,944] [INFO] [logging.py:96:log_dist] [Rank 0] step=4180, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:25:34,945] [INFO] [timer.py:259:stop] epoch=3/micro_step=592/global_step=4180, RunningAvgSamplesPerSec=6.514718173760628, CurrSamplesPerSec=5.755474670966618, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 4186 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 03:26:21,427] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step4186 is about to be saved! +[2024-10-24 03:26:21,428] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4186.bin, tag: global_step4186 +[2024-10-24 03:26:21,428] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4186.bin... +[2024-10-24 03:26:48,192] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4186.bin. +[2024-10-24 03:26:48,194] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step4186 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 03:27:01,413] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step4186 is about to be saved! +[2024-10-24 03:27:01,414] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4186.bin, tag: global_step4186 +[2024-10-24 03:27:01,414] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4186.bin... +[2024-10-24 03:27:34,958] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4186.bin. +[2024-10-24 03:27:34,960] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step4186 is ready now! +Model saved! +Checkpoint saved. +[2024-10-24 03:27:54,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=4190, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:27:54,618] [INFO] [timer.py:259:stop] epoch=3/micro_step=602/global_step=4190, RunningAvgSamplesPerSec=6.51426428116346, CurrSamplesPerSec=6.033583418002905, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:28:43,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=4200, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:28:43,951] [INFO] [timer.py:259:stop] epoch=3/micro_step=612/global_step=4200, RunningAvgSamplesPerSec=6.515006275634361, CurrSamplesPerSec=6.002661969122877, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:29:33,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=4210, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:29:33,007] [INFO] [timer.py:259:stop] epoch=3/micro_step=622/global_step=4210, RunningAvgSamplesPerSec=6.515893619493782, CurrSamplesPerSec=6.402827116728114, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:30:26,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=4220, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:30:26,262] [INFO] [timer.py:259:stop] epoch=3/micro_step=632/global_step=4220, RunningAvgSamplesPerSec=6.515414099100054, CurrSamplesPerSec=5.414765557614286, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:31:17,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=4230, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:31:17,693] [INFO] [timer.py:259:stop] epoch=3/micro_step=642/global_step=4230, RunningAvgSamplesPerSec=6.515348749135267, CurrSamplesPerSec=7.1947220337316455, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:32:10,614] [INFO] [logging.py:96:log_dist] [Rank 0] step=4240, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:32:10,615] [INFO] [timer.py:259:stop] epoch=3/micro_step=652/global_step=4240, RunningAvgSamplesPerSec=6.514948726731995, CurrSamplesPerSec=5.9018104008112395, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:33:01,239] [INFO] [logging.py:96:log_dist] [Rank 0] step=4250, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:33:01,239] [INFO] [timer.py:259:stop] epoch=3/micro_step=662/global_step=4250, RunningAvgSamplesPerSec=6.5151843225139325, CurrSamplesPerSec=6.901310040102929, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:33:54,497] [INFO] [logging.py:96:log_dist] [Rank 0] step=4260, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:33:54,497] [INFO] [timer.py:259:stop] epoch=3/micro_step=672/global_step=4260, RunningAvgSamplesPerSec=6.514787615246211, CurrSamplesPerSec=6.8529318489069855, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 03:34:47,806] [INFO] [logging.py:96:log_dist] [Rank 0] step=4270, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:34:47,806] [INFO] [timer.py:259:stop] epoch=3/micro_step=682/global_step=4270, RunningAvgSamplesPerSec=6.5142138758154005, CurrSamplesPerSec=5.901843099743773, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:35:39,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=4280, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:35:39,626] [INFO] [timer.py:259:stop] epoch=3/micro_step=692/global_step=4280, RunningAvgSamplesPerSec=6.514221718106773, CurrSamplesPerSec=6.2128149088987055, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:36:29,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=4290, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:36:29,158] [INFO] [timer.py:259:stop] epoch=3/micro_step=702/global_step=4290, RunningAvgSamplesPerSec=6.514841044613414, CurrSamplesPerSec=6.550011823469678, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:37:19,901] [INFO] [logging.py:96:log_dist] [Rank 0] step=4300, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:37:19,902] [INFO] [timer.py:259:stop] epoch=3/micro_step=712/global_step=4300, RunningAvgSamplesPerSec=6.515199304846144, CurrSamplesPerSec=5.757888439953852, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:38:09,829] [INFO] [logging.py:96:log_dist] [Rank 0] step=4310, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:38:09,830] [INFO] [timer.py:259:stop] epoch=3/micro_step=722/global_step=4310, RunningAvgSamplesPerSec=6.515731656260229, CurrSamplesPerSec=7.589797386823443, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:39:01,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=4320, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:39:01,674] [INFO] [timer.py:259:stop] epoch=3/micro_step=732/global_step=4320, RunningAvgSamplesPerSec=6.515739623522393, CurrSamplesPerSec=6.49764720080283, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:39:54,381] [INFO] [logging.py:96:log_dist] [Rank 0] step=4330, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:39:54,382] [INFO] [timer.py:259:stop] epoch=3/micro_step=742/global_step=4330, RunningAvgSamplesPerSec=6.515337942977928, CurrSamplesPerSec=6.443106328228334, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:40:45,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=4340, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:40:45,829] [INFO] [timer.py:259:stop] epoch=3/micro_step=752/global_step=4340, RunningAvgSamplesPerSec=6.515379351569981, CurrSamplesPerSec=6.153247373967097, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:41:38,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=4350, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:41:38,649] [INFO] [timer.py:259:stop] epoch=3/micro_step=762/global_step=4350, RunningAvgSamplesPerSec=6.515017113128443, CurrSamplesPerSec=6.850323643666011, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:42:28,444] [INFO] [logging.py:96:log_dist] [Rank 0] step=4360, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:42:28,444] [INFO] [timer.py:259:stop] epoch=3/micro_step=772/global_step=4360, RunningAvgSamplesPerSec=6.515541456025932, CurrSamplesPerSec=7.177181726434207, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:43:20,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=4370, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:43:20,766] [INFO] [timer.py:259:stop] epoch=3/micro_step=782/global_step=4370, RunningAvgSamplesPerSec=6.515265729997006, CurrSamplesPerSec=5.584488809859269, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 03:44:12,444] [INFO] [logging.py:96:log_dist] [Rank 0] step=4380, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:44:12,445] [INFO] [timer.py:259:stop] epoch=3/micro_step=792/global_step=4380, RunningAvgSamplesPerSec=6.515230618683387, CurrSamplesPerSec=7.764135121112175, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:45:05,951] [INFO] [logging.py:96:log_dist] [Rank 0] step=4390, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:45:05,951] [INFO] [timer.py:259:stop] epoch=3/micro_step=802/global_step=4390, RunningAvgSamplesPerSec=6.5146057942590945, CurrSamplesPerSec=6.834396522395728, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:45:56,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=4400, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:45:56,941] [INFO] [timer.py:259:stop] epoch=3/micro_step=812/global_step=4400, RunningAvgSamplesPerSec=6.514866982032961, CurrSamplesPerSec=6.768976676731244, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:46:48,111] [INFO] [logging.py:96:log_dist] [Rank 0] step=4410, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:46:48,111] [INFO] [timer.py:259:stop] epoch=3/micro_step=822/global_step=4410, RunningAvgSamplesPerSec=6.515290373076935, CurrSamplesPerSec=6.911277439972059, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 03:47:40,672] [INFO] [logging.py:96:log_dist] [Rank 0] step=4420, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:47:40,673] [INFO] [timer.py:259:stop] epoch=3/micro_step=832/global_step=4420, RunningAvgSamplesPerSec=6.515104961389886, CurrSamplesPerSec=6.2536273602878705, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:48:34,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=4430, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:48:34,851] [INFO] [timer.py:259:stop] epoch=3/micro_step=842/global_step=4430, RunningAvgSamplesPerSec=6.514342339615121, CurrSamplesPerSec=4.912426411012275, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:49:25,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=4440, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:49:25,502] [INFO] [timer.py:259:stop] epoch=3/micro_step=852/global_step=4440, RunningAvgSamplesPerSec=6.514591671254842, CurrSamplesPerSec=6.359321427985999, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:50:16,795] [INFO] [logging.py:96:log_dist] [Rank 0] step=4450, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:50:16,796] [INFO] [timer.py:259:stop] epoch=3/micro_step=862/global_step=4450, RunningAvgSamplesPerSec=6.514698511903131, CurrSamplesPerSec=6.794281038023156, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:51:09,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=4460, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:51:09,506] [INFO] [timer.py:259:stop] epoch=3/micro_step=872/global_step=4460, RunningAvgSamplesPerSec=6.5144770435482515, CurrSamplesPerSec=5.257546872887176, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:52:01,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=4470, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:52:01,178] [INFO] [timer.py:259:stop] epoch=3/micro_step=882/global_step=4470, RunningAvgSamplesPerSec=6.514424991988626, CurrSamplesPerSec=6.82232873244781, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:52:54,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=4480, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:52:54,519] [INFO] [timer.py:259:stop] epoch=3/micro_step=892/global_step=4480, RunningAvgSamplesPerSec=6.513843260603692, CurrSamplesPerSec=4.808322725875919, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:53:44,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=4490, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:53:44,918] [INFO] [timer.py:259:stop] epoch=3/micro_step=902/global_step=4490, RunningAvgSamplesPerSec=6.514208444206475, CurrSamplesPerSec=6.097746358676201, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 03:54:36,857] [INFO] [logging.py:96:log_dist] [Rank 0] step=4500, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:54:36,858] [INFO] [timer.py:259:stop] epoch=3/micro_step=912/global_step=4500, RunningAvgSamplesPerSec=6.514127302145134, CurrSamplesPerSec=6.742004392875616, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 03:55:27,754] [INFO] [logging.py:96:log_dist] [Rank 0] step=4510, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:55:27,754] [INFO] [timer.py:259:stop] epoch=3/micro_step=922/global_step=4510, RunningAvgSamplesPerSec=6.514302330552216, CurrSamplesPerSec=6.969669100540992, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:56:20,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=4520, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:56:20,623] [INFO] [timer.py:259:stop] epoch=3/micro_step=932/global_step=4520, RunningAvgSamplesPerSec=6.514010932176474, CurrSamplesPerSec=6.515821535798148, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:57:12,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=4530, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:57:12,665] [INFO] [timer.py:259:stop] epoch=3/micro_step=942/global_step=4530, RunningAvgSamplesPerSec=6.5139887606058995, CurrSamplesPerSec=6.403457923735085, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:58:03,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=4540, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:58:03,077] [INFO] [timer.py:259:stop] epoch=3/micro_step=952/global_step=4540, RunningAvgSamplesPerSec=6.514258220507552, CurrSamplesPerSec=6.679997691775831, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 03:58:55,913] [INFO] [logging.py:96:log_dist] [Rank 0] step=4550, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:58:55,913] [INFO] [timer.py:259:stop] epoch=3/micro_step=962/global_step=4550, RunningAvgSamplesPerSec=6.513854589139116, CurrSamplesPerSec=6.579466751295439, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 03:59:46,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=4560, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 03:59:46,375] [INFO] [timer.py:259:stop] epoch=3/micro_step=972/global_step=4560, RunningAvgSamplesPerSec=6.514242334454906, CurrSamplesPerSec=6.379138900545572, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:00:36,419] [INFO] [logging.py:96:log_dist] [Rank 0] step=4570, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:00:36,420] [INFO] [timer.py:259:stop] epoch=3/micro_step=982/global_step=4570, RunningAvgSamplesPerSec=6.5147352327004, CurrSamplesPerSec=7.075868098836216, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:01:27,214] [INFO] [logging.py:96:log_dist] [Rank 0] step=4580, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:01:27,215] [INFO] [timer.py:259:stop] epoch=3/micro_step=992/global_step=4580, RunningAvgSamplesPerSec=6.515048048995933, CurrSamplesPerSec=7.034897162972759, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:02:20,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=4590, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:02:20,310] [INFO] [timer.py:259:stop] epoch=3/micro_step=1002/global_step=4590, RunningAvgSamplesPerSec=6.51461637339516, CurrSamplesPerSec=5.8367361692688595, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:03:12,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=4600, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:03:12,946] [INFO] [timer.py:259:stop] epoch=3/micro_step=1012/global_step=4600, RunningAvgSamplesPerSec=6.514318292130164, CurrSamplesPerSec=6.742916536623486, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:04:03,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=4610, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:04:03,696] [INFO] [timer.py:259:stop] epoch=3/micro_step=1022/global_step=4610, RunningAvgSamplesPerSec=6.514612035442464, CurrSamplesPerSec=6.4413654365515685, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:04:54,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=4620, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:04:54,961] [INFO] [timer.py:259:stop] epoch=3/micro_step=1032/global_step=4620, RunningAvgSamplesPerSec=6.514698422802994, CurrSamplesPerSec=6.009540512310688, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:05:47,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=4630, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:05:47,818] [INFO] [timer.py:259:stop] epoch=3/micro_step=1042/global_step=4630, RunningAvgSamplesPerSec=6.514394365430168, CurrSamplesPerSec=6.4074977278346354, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:06:39,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=4640, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:06:39,916] [INFO] [timer.py:259:stop] epoch=3/micro_step=1052/global_step=4640, RunningAvgSamplesPerSec=6.51428087453139, CurrSamplesPerSec=5.88384899744672, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 04:07:32,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=4650, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:07:32,023] [INFO] [timer.py:259:stop] epoch=3/micro_step=1062/global_step=4650, RunningAvgSamplesPerSec=6.514115611026878, CurrSamplesPerSec=6.500224462382347, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:08:23,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=4660, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:08:23,542] [INFO] [timer.py:259:stop] epoch=3/micro_step=1072/global_step=4660, RunningAvgSamplesPerSec=6.514112033350708, CurrSamplesPerSec=7.2674100567190925, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:09:14,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=4670, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:09:14,609] [INFO] [timer.py:259:stop] epoch=3/micro_step=1082/global_step=4670, RunningAvgSamplesPerSec=6.514221995260599, CurrSamplesPerSec=7.33141350186495, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:10:05,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=4680, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:10:05,962] [INFO] [timer.py:259:stop] epoch=3/micro_step=1092/global_step=4680, RunningAvgSamplesPerSec=6.514389529205228, CurrSamplesPerSec=6.652989030972296, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:11:00,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=4690, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:11:00,278] [INFO] [timer.py:259:stop] epoch=3/micro_step=1102/global_step=4690, RunningAvgSamplesPerSec=6.51366223750045, CurrSamplesPerSec=5.8300897874743915, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:11:50,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=4700, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:11:50,846] [INFO] [timer.py:259:stop] epoch=3/micro_step=1112/global_step=4700, RunningAvgSamplesPerSec=6.513952728412473, CurrSamplesPerSec=6.34938264980232, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:12:42,462] [INFO] [logging.py:96:log_dist] [Rank 0] step=4710, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:12:42,463] [INFO] [timer.py:259:stop] epoch=3/micro_step=1122/global_step=4710, RunningAvgSamplesPerSec=6.514055317447724, CurrSamplesPerSec=5.868094733508076, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 04:13:33,305] [INFO] [logging.py:96:log_dist] [Rank 0] step=4720, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:13:33,306] [INFO] [timer.py:259:stop] epoch=3/micro_step=1132/global_step=4720, RunningAvgSamplesPerSec=6.514379071488735, CurrSamplesPerSec=6.84744492766819, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:14:23,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=4730, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:14:23,446] [INFO] [timer.py:259:stop] epoch=3/micro_step=1142/global_step=4730, RunningAvgSamplesPerSec=6.514796796716388, CurrSamplesPerSec=7.383656293467518, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:15:15,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=4740, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:15:15,158] [INFO] [timer.py:259:stop] epoch=3/micro_step=1152/global_step=4740, RunningAvgSamplesPerSec=6.5148401114244745, CurrSamplesPerSec=6.710500481028486, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:16:05,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=4750, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:16:05,914] [INFO] [timer.py:259:stop] epoch=3/micro_step=1162/global_step=4750, RunningAvgSamplesPerSec=6.5151324418751715, CurrSamplesPerSec=6.868579297940065, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:16:56,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=4760, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:16:56,263] [INFO] [timer.py:259:stop] epoch=3/micro_step=1172/global_step=4760, RunningAvgSamplesPerSec=6.5154779154374305, CurrSamplesPerSec=6.286204506433742, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:17:48,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=4770, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:17:48,414] [INFO] [timer.py:259:stop] epoch=3/micro_step=1182/global_step=4770, RunningAvgSamplesPerSec=6.5152491405710204, CurrSamplesPerSec=7.096652557777577, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:18:41,347] [INFO] [logging.py:96:log_dist] [Rank 0] step=4780, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:18:41,347] [INFO] [timer.py:259:stop] epoch=3/micro_step=1192/global_step=4780, RunningAvgSamplesPerSec=6.514887292202939, CurrSamplesPerSec=5.883365663761515, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 4784 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 04:19:12,352] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step4784 is about to be saved! +[2024-10-24 04:19:12,353] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4784.bin, tag: global_step4784 +[2024-10-24 04:19:12,353] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4784.bin... +[2024-10-24 04:19:41,147] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4784.bin. +[2024-10-24 04:19:41,149] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step4784 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 04:19:57,666] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step4784 is about to be saved! +[2024-10-24 04:19:57,666] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4784.bin, tag: global_step4784 +[2024-10-24 04:19:57,667] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4784.bin... +[2024-10-24 04:20:39,163] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_4784.bin. +[2024-10-24 04:20:39,164] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step4784 is ready now! +Model saved! +Checkpoint saved. + +***** Evaluating at epoch 4/5.0 ***** + +***** Evaluating at the beginning ***** +Evaluation: accuracy = 0.644477, reward_mean = 6.962779, reward_std = 3.338448 +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ +┃ prompt  ┃ higher-reward response  ┃ reward  ┃ lower-reward response  ┃ reward  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ +│ USER: │ There are two bags visible in the image. │ 10.562500 │ There are four bags visible in the │ 10.312500 │ +│ How many bags are visible in the image? │ │ │ image. │ │ +│ │ │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the signs and advertisements │ 6.218750 │ Based on the presence of multiple │ 3.640625 │ +│ What can you infer about the purpose of │ visible in the image, it can be inferred │ │ signs, including a large red sign that │ │ +│ the building with the pyramid-shaped │ that the building with the │ │ reads "Sale," it is likely that this │ │ +│ structure based on the signs and │ pyramid-shaped structure is likely a │ │ building serves as an outlet for retail │ │ +│ advertisements? │ shopping center or a commercial complex. │ │ or commercial purposes. The pyramid │ │ +│ │ The signs include names of various │ │ shape suggests that there might be some │ │ +│ │ brands and stores, such as "Ochsner," │ │ architectural significance to the │ │ +│ │ "Manor," and "Traita fina," which are │ │ design, but without more context, we │ │ +│ │ typically associated with retail and │ │ cannot definitively determine its │ │ +│ │ shopping. Additionally, the large text │ │ specific function. However, given the │ │ +│ │ "Shopping Center St. Jakob-Park" further │ │ promotional nature of the signs, one │ │ +│ │ supports the idea that this building is │ │ could reasonably assume that the │ │ +│ │ a shopping center. The prominent display │ │ building houses shops or businesses │ │ +│ │ of the word "BASEL" suggests that this │ │ offering discounts or special deals to │ │ +│ │ shopping center is located in or near │ │ attract customers. │ │ +│ │ Basel, Switzerland. │ │ │ │ +├──────────────────────────────────────────┼──────────────────────────────────────────┼───────────┼─────────────────────────────────────────┼───────────┤ +│ USER: │ Based on the lighting and shadows in the │ 7.468750 │ The image shows a plane at an airport │ 6.218750 │ +│ Based on the lighting and shadows, can │ image, it appears that the sun is │ │ terminal with lights in the distance. │ │ +│ you infer if the sun is rising or │ setting. The sky has a gradient from a │ │ The sky has hues of yellow and orange │ │ +│ setting? │ darker blue at the top to a lighter │ │ near the horizon which suggests that │ │ +│ │ yellow-orange near the horizon, which is │ │ it's either sunrise or sunset. However, │ │ +│ │ typical of sunset. Additionally, the │ │ there are no direct sunlight rays │ │ +│ │ lights at the airport are on, which is │ │ visible, so we cannot determine whether │ │ +│ │ more common during the evening as it │ │ this photo was taken during dawn │ │ +│ │ gets darker. │ │ (sunrise) or dusk (sunset). It could be │ │ +│ │ │ │ either one depending on where the │ │ +│ │ │ │ photographer took the picture from. If │ │ +│ │ │ │ they were standing facing east when │ │ +│ │ │ │ taking the shot, then it would likely │ │ +│ │ │ │ be morning; but if they faced west │ │ +│ │ │ │ while capturing the scene, it might │ │ +│ │ │ │ have been evening instead. │ │ +│ │ │ │ │ │ +│ │ │ │ It appears to be early morning based │ │ +│ │ │ │ off how dark everything else looks │ │ +│ │ │ │ except for those areas around the │ │ +│ │ │ │ horizon line indicating light coming up │ │ +│ │ │ │ over mountains behind them - suggesting │ │ +│ │ │ │ daytime hours ahead rather than │ │ +│ │ │ │ nighttime ones since all other parts │ │ +│ │ │ │ remain shadowed by darkness still │ │ +│ │ │ │ lingering outside city limits below us │ │ +│ │ │ │ here today... │ │ +└──────────────────────────────────────────┴──────────────────────────────────────────┴───────────┴─────────────────────────────────────────┴───────────┘ +[2024-10-24 04:41:24,518] [WARNING] [stage3.py:2102:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time +[2024-10-24 04:41:50,613] [INFO] [logging.py:96:log_dist] [Rank 0] step=4790, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:41:50,614] [INFO] [timer.py:259:stop] epoch=4/micro_step=6/global_step=4790, RunningAvgSamplesPerSec=6.51550432992297, CurrSamplesPerSec=7.607793439625185, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:42:40,981] [INFO] [logging.py:96:log_dist] [Rank 0] step=4800, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:42:40,982] [INFO] [timer.py:259:stop] epoch=4/micro_step=16/global_step=4800, RunningAvgSamplesPerSec=6.515974649815899, CurrSamplesPerSec=7.2667502099765455, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:43:34,421] [INFO] [logging.py:96:log_dist] [Rank 0] step=4810, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:43:34,422] [INFO] [timer.py:259:stop] epoch=4/micro_step=26/global_step=4810, RunningAvgSamplesPerSec=6.515519575766837, CurrSamplesPerSec=6.737048990598311, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:44:26,784] [INFO] [logging.py:96:log_dist] [Rank 0] step=4820, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:44:26,785] [INFO] [timer.py:259:stop] epoch=4/micro_step=36/global_step=4820, RunningAvgSamplesPerSec=6.515205483554594, CurrSamplesPerSec=5.762532534701873, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:45:21,318] [INFO] [logging.py:96:log_dist] [Rank 0] step=4830, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:45:21,318] [INFO] [timer.py:259:stop] epoch=4/micro_step=46/global_step=4830, RunningAvgSamplesPerSec=6.514455955009127, CurrSamplesPerSec=5.978201547214852, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:46:13,437] [INFO] [logging.py:96:log_dist] [Rank 0] step=4840, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:46:13,438] [INFO] [timer.py:259:stop] epoch=4/micro_step=56/global_step=4840, RunningAvgSamplesPerSec=6.514349612537549, CurrSamplesPerSec=6.7649794481936, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:47:04,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=4850, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:47:04,411] [INFO] [timer.py:259:stop] epoch=4/micro_step=66/global_step=4850, RunningAvgSamplesPerSec=6.514534348677044, CurrSamplesPerSec=6.1595411887022, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:47:55,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=4860, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:47:55,258] [INFO] [timer.py:259:stop] epoch=4/micro_step=76/global_step=4860, RunningAvgSamplesPerSec=6.514743872632178, CurrSamplesPerSec=6.405620411708745, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:48:48,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=4870, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:48:48,446] [INFO] [timer.py:259:stop] epoch=4/micro_step=86/global_step=4870, RunningAvgSamplesPerSec=6.514343643471506, CurrSamplesPerSec=6.906816485575945, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:49:38,549] [INFO] [logging.py:96:log_dist] [Rank 0] step=4880, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:49:38,550] [INFO] [timer.py:259:stop] epoch=4/micro_step=96/global_step=4880, RunningAvgSamplesPerSec=6.514871816462659, CurrSamplesPerSec=6.500111447998317, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:50:28,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=4890, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:50:28,246] [INFO] [timer.py:259:stop] epoch=4/micro_step=106/global_step=4890, RunningAvgSamplesPerSec=6.515447097689247, CurrSamplesPerSec=6.827015228107332, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:51:20,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=4900, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:51:20,051] [INFO] [timer.py:259:stop] epoch=4/micro_step=116/global_step=4900, RunningAvgSamplesPerSec=6.515374023434758, CurrSamplesPerSec=5.997875943511256, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:52:14,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=4910, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:52:14,159] [INFO] [timer.py:259:stop] epoch=4/micro_step=126/global_step=4910, RunningAvgSamplesPerSec=6.514871268319111, CurrSamplesPerSec=7.408765352690542, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:53:06,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=4920, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:53:06,722] [INFO] [timer.py:259:stop] epoch=4/micro_step=136/global_step=4920, RunningAvgSamplesPerSec=6.514635497377001, CurrSamplesPerSec=7.281170541781506, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:53:59,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=4930, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:53:59,274] [INFO] [timer.py:259:stop] epoch=4/micro_step=146/global_step=4930, RunningAvgSamplesPerSec=6.514371208783495, CurrSamplesPerSec=6.370041048223885, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:54:49,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=4940, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:54:49,654] [INFO] [timer.py:259:stop] epoch=4/micro_step=156/global_step=4940, RunningAvgSamplesPerSec=6.514708491620274, CurrSamplesPerSec=7.2521697896070245, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:55:40,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=4950, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:55:40,298] [INFO] [timer.py:259:stop] epoch=4/micro_step=166/global_step=4950, RunningAvgSamplesPerSec=6.514946638385245, CurrSamplesPerSec=6.9294598597232095, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:56:31,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=4960, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:56:31,418] [INFO] [timer.py:259:stop] epoch=4/micro_step=176/global_step=4960, RunningAvgSamplesPerSec=6.515127157417266, CurrSamplesPerSec=6.6173001753861795, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:57:23,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=4970, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:57:23,322] [INFO] [timer.py:259:stop] epoch=4/micro_step=186/global_step=4970, RunningAvgSamplesPerSec=6.515144761563292, CurrSamplesPerSec=6.773421268716152, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 04:58:13,870] [INFO] [logging.py:96:log_dist] [Rank 0] step=4980, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:58:13,871] [INFO] [timer.py:259:stop] epoch=4/micro_step=196/global_step=4980, RunningAvgSamplesPerSec=6.515380303157778, CurrSamplesPerSec=5.787580091036271, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 04:59:07,673] [INFO] [logging.py:96:log_dist] [Rank 0] step=4990, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:59:07,673] [INFO] [timer.py:259:stop] epoch=4/micro_step=206/global_step=4990, RunningAvgSamplesPerSec=6.51487341036355, CurrSamplesPerSec=5.956007639489021, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 04:59:57,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=5000, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 04:59:57,779] [INFO] [timer.py:259:stop] epoch=4/micro_step=216/global_step=5000, RunningAvgSamplesPerSec=6.51525847655083, CurrSamplesPerSec=7.297083682262972, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:00:52,061] [INFO] [logging.py:96:log_dist] [Rank 0] step=5010, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:00:52,062] [INFO] [timer.py:259:stop] epoch=4/micro_step=226/global_step=5010, RunningAvgSamplesPerSec=6.514535979285112, CurrSamplesPerSec=6.399200785148305, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:01:44,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=5020, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:01:44,730] [INFO] [timer.py:259:stop] epoch=4/micro_step=236/global_step=5020, RunningAvgSamplesPerSec=6.514268003219484, CurrSamplesPerSec=6.519708278828805, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:02:34,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=5030, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:02:34,731] [INFO] [timer.py:259:stop] epoch=4/micro_step=246/global_step=5030, RunningAvgSamplesPerSec=6.514773071703017, CurrSamplesPerSec=6.467865351588933, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:03:26,175] [INFO] [logging.py:96:log_dist] [Rank 0] step=5040, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:03:26,176] [INFO] [timer.py:259:stop] epoch=4/micro_step=256/global_step=5040, RunningAvgSamplesPerSec=6.514801030439281, CurrSamplesPerSec=5.750839476646273, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:04:16,779] [INFO] [logging.py:96:log_dist] [Rank 0] step=5050, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:04:16,780] [INFO] [timer.py:259:stop] epoch=4/micro_step=266/global_step=5050, RunningAvgSamplesPerSec=6.515035210560673, CurrSamplesPerSec=6.220573864902977, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:05:09,406] [INFO] [logging.py:96:log_dist] [Rank 0] step=5060, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:05:09,406] [INFO] [timer.py:259:stop] epoch=4/micro_step=276/global_step=5060, RunningAvgSamplesPerSec=6.5147761990425375, CurrSamplesPerSec=5.99538375483785, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:06:00,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=5070, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:06:00,440] [INFO] [timer.py:259:stop] epoch=4/micro_step=286/global_step=5070, RunningAvgSamplesPerSec=6.515001979469154, CurrSamplesPerSec=7.132879614597586, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:06:53,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=5080, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:06:53,019] [INFO] [timer.py:259:stop] epoch=4/micro_step=296/global_step=5080, RunningAvgSamplesPerSec=6.514774529382083, CurrSamplesPerSec=6.81622322506571, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:07:46,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=5090, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:07:46,898] [INFO] [timer.py:259:stop] epoch=4/micro_step=306/global_step=5090, RunningAvgSamplesPerSec=6.514204620159872, CurrSamplesPerSec=5.14141474352278, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:08:38,642] [INFO] [logging.py:96:log_dist] [Rank 0] step=5100, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:08:38,642] [INFO] [timer.py:259:stop] epoch=4/micro_step=316/global_step=5100, RunningAvgSamplesPerSec=6.514144862659381, CurrSamplesPerSec=7.361257878833581, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:09:28,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=5110, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:09:28,537] [INFO] [timer.py:259:stop] epoch=4/micro_step=326/global_step=5110, RunningAvgSamplesPerSec=6.5145752719571846, CurrSamplesPerSec=7.042624584947464, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:10:20,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=5120, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:10:20,546] [INFO] [timer.py:259:stop] epoch=4/micro_step=336/global_step=5120, RunningAvgSamplesPerSec=6.514517814398527, CurrSamplesPerSec=5.788391042207171, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:11:12,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=5130, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:11:12,737] [INFO] [timer.py:259:stop] epoch=4/micro_step=346/global_step=5130, RunningAvgSamplesPerSec=6.514372593001071, CurrSamplesPerSec=7.3624305060497965, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:12:04,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=5140, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:12:04,288] [INFO] [timer.py:259:stop] epoch=4/micro_step=356/global_step=5140, RunningAvgSamplesPerSec=6.514280078928078, CurrSamplesPerSec=6.324001030328358, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:12:54,991] [INFO] [logging.py:96:log_dist] [Rank 0] step=5150, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:12:54,992] [INFO] [timer.py:259:stop] epoch=4/micro_step=366/global_step=5150, RunningAvgSamplesPerSec=6.514674051700182, CurrSamplesPerSec=6.372139877527113, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:13:46,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=5160, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:13:46,023] [INFO] [timer.py:259:stop] epoch=4/micro_step=376/global_step=5160, RunningAvgSamplesPerSec=6.5147722341695555, CurrSamplesPerSec=7.436883441748651, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:14:36,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=5170, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:14:36,866] [INFO] [timer.py:259:stop] epoch=4/micro_step=386/global_step=5170, RunningAvgSamplesPerSec=6.515019391710682, CurrSamplesPerSec=6.816779550669636, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:15:27,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=5180, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:15:27,129] [INFO] [timer.py:259:stop] epoch=4/micro_step=396/global_step=5180, RunningAvgSamplesPerSec=6.5152799954476315, CurrSamplesPerSec=6.346672388642724, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:16:21,981] [INFO] [logging.py:96:log_dist] [Rank 0] step=5190, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:16:21,982] [INFO] [timer.py:259:stop] epoch=4/micro_step=406/global_step=5190, RunningAvgSamplesPerSec=6.514446113638197, CurrSamplesPerSec=6.014066523792879, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:17:12,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=5200, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:17:12,965] [INFO] [timer.py:259:stop] epoch=4/micro_step=416/global_step=5200, RunningAvgSamplesPerSec=6.514689156092063, CurrSamplesPerSec=6.544443149898836, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:18:03,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=5210, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:18:03,231] [INFO] [timer.py:259:stop] epoch=4/micro_step=426/global_step=5210, RunningAvgSamplesPerSec=6.515018914208735, CurrSamplesPerSec=6.2175374860306905, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:18:54,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=5220, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:18:54,526] [INFO] [timer.py:259:stop] epoch=4/micro_step=436/global_step=5220, RunningAvgSamplesPerSec=6.515233291424165, CurrSamplesPerSec=5.920669399413316, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:19:46,494] [INFO] [logging.py:96:log_dist] [Rank 0] step=5230, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:19:46,495] [INFO] [timer.py:259:stop] epoch=4/micro_step=446/global_step=5230, RunningAvgSamplesPerSec=6.515116335412709, CurrSamplesPerSec=6.934603578680759, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:20:38,849] [INFO] [logging.py:96:log_dist] [Rank 0] step=5240, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:20:38,850] [INFO] [timer.py:259:stop] epoch=4/micro_step=456/global_step=5240, RunningAvgSamplesPerSec=6.5150324371776085, CurrSamplesPerSec=6.497415378954601, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:21:27,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=5250, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:21:27,889] [INFO] [timer.py:259:stop] epoch=4/micro_step=466/global_step=5250, RunningAvgSamplesPerSec=6.515752321635457, CurrSamplesPerSec=5.745547591106361, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:22:18,918] [INFO] [logging.py:96:log_dist] [Rank 0] step=5260, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:22:18,918] [INFO] [timer.py:259:stop] epoch=4/micro_step=476/global_step=5260, RunningAvgSamplesPerSec=6.515890148027979, CurrSamplesPerSec=6.540592207447628, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:23:10,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=5270, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:23:10,638] [INFO] [timer.py:259:stop] epoch=4/micro_step=486/global_step=5270, RunningAvgSamplesPerSec=6.515918978551668, CurrSamplesPerSec=7.56945732180733, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:24:02,870] [INFO] [logging.py:96:log_dist] [Rank 0] step=5280, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:24:02,871] [INFO] [timer.py:259:stop] epoch=4/micro_step=496/global_step=5280, RunningAvgSamplesPerSec=6.5157870300186, CurrSamplesPerSec=6.406931573179791, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:24:54,056] [INFO] [logging.py:96:log_dist] [Rank 0] step=5290, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:24:54,057] [INFO] [timer.py:259:stop] epoch=4/micro_step=506/global_step=5290, RunningAvgSamplesPerSec=6.515869683184322, CurrSamplesPerSec=6.992267044805684, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:25:46,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=5300, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:25:46,593] [INFO] [timer.py:259:stop] epoch=4/micro_step=516/global_step=5300, RunningAvgSamplesPerSec=6.515689027643921, CurrSamplesPerSec=5.952999551513074, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 05:26:40,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=5310, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:26:40,510] [INFO] [timer.py:259:stop] epoch=4/micro_step=526/global_step=5310, RunningAvgSamplesPerSec=6.515061997286526, CurrSamplesPerSec=5.810932169960246, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:27:30,813] [INFO] [logging.py:96:log_dist] [Rank 0] step=5320, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:27:30,814] [INFO] [timer.py:259:stop] epoch=4/micro_step=536/global_step=5320, RunningAvgSamplesPerSec=6.515399212592488, CurrSamplesPerSec=6.453566506761464, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:28:20,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=5330, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:28:20,689] [INFO] [timer.py:259:stop] epoch=4/micro_step=546/global_step=5330, RunningAvgSamplesPerSec=6.515862777536235, CurrSamplesPerSec=6.8373311497251725, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:29:13,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=5340, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:29:13,832] [INFO] [timer.py:259:stop] epoch=4/micro_step=556/global_step=5340, RunningAvgSamplesPerSec=6.515413398002297, CurrSamplesPerSec=6.8371590899441745, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:30:06,630] [INFO] [logging.py:96:log_dist] [Rank 0] step=5350, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:30:06,631] [INFO] [timer.py:259:stop] epoch=4/micro_step=566/global_step=5350, RunningAvgSamplesPerSec=6.51519418716859, CurrSamplesPerSec=6.873484802358841, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:30:57,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=5360, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:30:57,847] [INFO] [timer.py:259:stop] epoch=4/micro_step=576/global_step=5360, RunningAvgSamplesPerSec=6.51539278047715, CurrSamplesPerSec=6.0610702148315845, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:31:50,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=5370, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:31:50,393] [INFO] [timer.py:259:stop] epoch=4/micro_step=586/global_step=5370, RunningAvgSamplesPerSec=6.515097156639348, CurrSamplesPerSec=6.323153118598114, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:32:44,814] [INFO] [logging.py:96:log_dist] [Rank 0] step=5380, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:32:44,814] [INFO] [timer.py:259:stop] epoch=4/micro_step=596/global_step=5380, RunningAvgSamplesPerSec=6.514428615388676, CurrSamplesPerSec=5.991661924004071, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 5382 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 05:33:08,439] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step5382 is about to be saved! +[2024-10-24 05:33:08,439] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5382.bin, tag: global_step5382 +[2024-10-24 05:33:08,440] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5382.bin... +[2024-10-24 05:33:39,283] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5382.bin. +[2024-10-24 05:33:39,284] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step5382 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 05:33:58,547] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step5382 is about to be saved! +[2024-10-24 05:33:58,548] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5382.bin, tag: global_step5382 +[2024-10-24 05:33:58,548] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5382.bin... +[2024-10-24 05:34:29,678] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5382.bin. +[2024-10-24 05:34:29,679] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step5382 is ready now! +Model saved! +Checkpoint saved. +[2024-10-24 05:35:10,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=5390, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:35:10,002] [INFO] [timer.py:259:stop] epoch=4/micro_step=606/global_step=5390, RunningAvgSamplesPerSec=6.514941090226021, CurrSamplesPerSec=6.121596933336734, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 05:35:58,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=5400, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:35:58,730] [INFO] [timer.py:259:stop] epoch=4/micro_step=616/global_step=5400, RunningAvgSamplesPerSec=6.515616199907367, CurrSamplesPerSec=7.9835647398101015, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:36:50,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=5410, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:36:50,263] [INFO] [timer.py:259:stop] epoch=4/micro_step=626/global_step=5410, RunningAvgSamplesPerSec=6.515659400981985, CurrSamplesPerSec=6.570373639586046, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:37:43,159] [INFO] [logging.py:96:log_dist] [Rank 0] step=5420, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:37:43,159] [INFO] [timer.py:259:stop] epoch=4/micro_step=636/global_step=5420, RunningAvgSamplesPerSec=6.5153611591137945, CurrSamplesPerSec=7.108760078129864, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:38:35,166] [INFO] [logging.py:96:log_dist] [Rank 0] step=5430, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:38:35,167] [INFO] [timer.py:259:stop] epoch=4/micro_step=646/global_step=5430, RunningAvgSamplesPerSec=6.5152947615117816, CurrSamplesPerSec=6.304334614640318, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:39:27,234] [INFO] [logging.py:96:log_dist] [Rank 0] step=5440, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:39:27,235] [INFO] [timer.py:259:stop] epoch=4/micro_step=656/global_step=5440, RunningAvgSamplesPerSec=6.515198055191274, CurrSamplesPerSec=6.867083293098917, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:40:18,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=5450, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:40:18,590] [INFO] [timer.py:259:stop] epoch=4/micro_step=666/global_step=5450, RunningAvgSamplesPerSec=6.515227813168577, CurrSamplesPerSec=7.123738688422714, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:41:11,735] [INFO] [logging.py:96:log_dist] [Rank 0] step=5460, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:41:11,735] [INFO] [timer.py:259:stop] epoch=4/micro_step=676/global_step=5460, RunningAvgSamplesPerSec=6.514862826282264, CurrSamplesPerSec=6.310483924982851, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:42:05,510] [INFO] [logging.py:96:log_dist] [Rank 0] step=5470, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:42:05,511] [INFO] [timer.py:259:stop] epoch=4/micro_step=686/global_step=5470, RunningAvgSamplesPerSec=6.514341345323293, CurrSamplesPerSec=6.524779718461788, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:42:54,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=5480, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:42:54,882] [INFO] [timer.py:259:stop] epoch=4/micro_step=696/global_step=5480, RunningAvgSamplesPerSec=6.514892208682953, CurrSamplesPerSec=7.009257088575872, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:43:45,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=5490, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:43:45,834] [INFO] [timer.py:259:stop] epoch=4/micro_step=706/global_step=5490, RunningAvgSamplesPerSec=6.515134653121007, CurrSamplesPerSec=7.102598547310192, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:44:37,101] [INFO] [logging.py:96:log_dist] [Rank 0] step=5500, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:44:37,102] [INFO] [timer.py:259:stop] epoch=4/micro_step=716/global_step=5500, RunningAvgSamplesPerSec=6.5152810911093715, CurrSamplesPerSec=6.593591939884538, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:45:26,233] [INFO] [logging.py:96:log_dist] [Rank 0] step=5510, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:45:26,234] [INFO] [timer.py:259:stop] epoch=4/micro_step=726/global_step=5510, RunningAvgSamplesPerSec=6.515893350733375, CurrSamplesPerSec=6.600141570446546, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:46:19,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=5520, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:46:19,768] [INFO] [timer.py:259:stop] epoch=4/micro_step=736/global_step=5520, RunningAvgSamplesPerSec=6.515419387866999, CurrSamplesPerSec=5.921470009286246, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:47:12,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=5530, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:47:12,714] [INFO] [timer.py:259:stop] epoch=4/micro_step=746/global_step=5530, RunningAvgSamplesPerSec=6.515059421242713, CurrSamplesPerSec=6.475412806925817, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:48:03,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=5540, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:48:03,372] [INFO] [timer.py:259:stop] epoch=4/micro_step=756/global_step=5540, RunningAvgSamplesPerSec=6.515315413523875, CurrSamplesPerSec=6.782408527633573, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:48:56,234] [INFO] [logging.py:96:log_dist] [Rank 0] step=5550, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:48:56,235] [INFO] [timer.py:259:stop] epoch=4/micro_step=766/global_step=5550, RunningAvgSamplesPerSec=6.514985328869318, CurrSamplesPerSec=6.118984990409769, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 05:49:46,609] [INFO] [logging.py:96:log_dist] [Rank 0] step=5560, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:49:46,610] [INFO] [timer.py:259:stop] epoch=4/micro_step=776/global_step=5560, RunningAvgSamplesPerSec=6.5152777677274365, CurrSamplesPerSec=6.32199542792683, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:50:38,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=5570, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:50:38,535] [INFO] [timer.py:259:stop] epoch=4/micro_step=786/global_step=5570, RunningAvgSamplesPerSec=6.515240441196328, CurrSamplesPerSec=6.196018572678899, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:51:31,797] [INFO] [logging.py:96:log_dist] [Rank 0] step=5580, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:51:31,798] [INFO] [timer.py:259:stop] epoch=4/micro_step=796/global_step=5580, RunningAvgSamplesPerSec=6.51483001149672, CurrSamplesPerSec=6.4987017743216775, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:52:23,309] [INFO] [logging.py:96:log_dist] [Rank 0] step=5590, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:52:23,309] [INFO] [timer.py:259:stop] epoch=4/micro_step=806/global_step=5590, RunningAvgSamplesPerSec=6.5148954674718835, CurrSamplesPerSec=6.7258462982860205, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:53:15,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=5600, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:53:15,141] [INFO] [timer.py:259:stop] epoch=4/micro_step=816/global_step=5600, RunningAvgSamplesPerSec=6.514953779322104, CurrSamplesPerSec=7.335118882351874, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:54:06,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=5610, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:54:06,227] [INFO] [timer.py:259:stop] epoch=4/micro_step=826/global_step=5610, RunningAvgSamplesPerSec=6.515134146952314, CurrSamplesPerSec=6.42871370002953, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 05:54:57,958] [INFO] [logging.py:96:log_dist] [Rank 0] step=5620, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:54:57,958] [INFO] [timer.py:259:stop] epoch=4/micro_step=836/global_step=5620, RunningAvgSamplesPerSec=6.515150795336363, CurrSamplesPerSec=6.283450532255495, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:55:52,526] [INFO] [logging.py:96:log_dist] [Rank 0] step=5630, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:55:52,527] [INFO] [timer.py:259:stop] epoch=4/micro_step=846/global_step=5630, RunningAvgSamplesPerSec=6.514477933575557, CurrSamplesPerSec=6.478399887343872, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 05:56:44,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=5640, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:56:44,122] [INFO] [timer.py:259:stop] epoch=4/micro_step=856/global_step=5640, RunningAvgSamplesPerSec=6.514461362285895, CurrSamplesPerSec=6.34440075960419, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:57:35,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=5650, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:57:35,099] [INFO] [timer.py:259:stop] epoch=4/micro_step=866/global_step=5650, RunningAvgSamplesPerSec=6.514677913552566, CurrSamplesPerSec=6.731508647920384, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:58:27,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=5660, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:58:27,726] [INFO] [timer.py:259:stop] epoch=4/micro_step=876/global_step=5660, RunningAvgSamplesPerSec=6.514526618657933, CurrSamplesPerSec=7.267144844520678, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 05:59:19,639] [INFO] [logging.py:96:log_dist] [Rank 0] step=5670, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 05:59:19,639] [INFO] [timer.py:259:stop] epoch=4/micro_step=886/global_step=5670, RunningAvgSamplesPerSec=6.514475035603434, CurrSamplesPerSec=5.962027118290286, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:00:13,118] [INFO] [logging.py:96:log_dist] [Rank 0] step=5680, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:00:13,119] [INFO] [timer.py:259:stop] epoch=4/micro_step=896/global_step=5680, RunningAvgSamplesPerSec=6.514023489315402, CurrSamplesPerSec=7.046059267954566, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:01:03,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=5690, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:01:03,578] [INFO] [timer.py:259:stop] epoch=4/micro_step=906/global_step=5690, RunningAvgSamplesPerSec=6.51435720833978, CurrSamplesPerSec=6.763441998513356, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:01:55,758] [INFO] [logging.py:96:log_dist] [Rank 0] step=5700, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:01:55,758] [INFO] [timer.py:259:stop] epoch=4/micro_step=916/global_step=5700, RunningAvgSamplesPerSec=6.514231017049463, CurrSamplesPerSec=6.290890155769188, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:02:47,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=5710, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:02:47,602] [INFO] [timer.py:259:stop] epoch=4/micro_step=926/global_step=5710, RunningAvgSamplesPerSec=6.5141865571253526, CurrSamplesPerSec=6.0529871661095696, MemAllocated=25.42GB, MaxMemAllocated=45.92GB +[2024-10-24 06:03:40,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=5720, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:03:40,142] [INFO] [timer.py:259:stop] epoch=4/micro_step=936/global_step=5720, RunningAvgSamplesPerSec=6.514087195291291, CurrSamplesPerSec=5.881175151489892, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:04:30,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=5730, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:04:30,782] [INFO] [timer.py:259:stop] epoch=4/micro_step=946/global_step=5730, RunningAvgSamplesPerSec=6.514324696253298, CurrSamplesPerSec=6.690880935543525, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:05:22,430] [INFO] [logging.py:96:log_dist] [Rank 0] step=5740, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:05:22,431] [INFO] [timer.py:259:stop] epoch=4/micro_step=956/global_step=5740, RunningAvgSamplesPerSec=6.514340855325288, CurrSamplesPerSec=7.195233085484614, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:06:14,210] [INFO] [logging.py:96:log_dist] [Rank 0] step=5750, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:06:14,211] [INFO] [timer.py:259:stop] epoch=4/micro_step=966/global_step=5750, RunningAvgSamplesPerSec=6.514292229301095, CurrSamplesPerSec=7.05286642393016, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:07:05,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=5760, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:07:05,731] [INFO] [timer.py:259:stop] epoch=4/micro_step=976/global_step=5760, RunningAvgSamplesPerSec=6.514330406761347, CurrSamplesPerSec=6.1779282112585205, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:07:55,755] [INFO] [logging.py:96:log_dist] [Rank 0] step=5770, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:07:55,756] [INFO] [timer.py:259:stop] epoch=4/micro_step=986/global_step=5770, RunningAvgSamplesPerSec=6.5147282430892774, CurrSamplesPerSec=6.310676488227738, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:08:47,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=5780, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:08:47,070] [INFO] [timer.py:259:stop] epoch=4/micro_step=996/global_step=5780, RunningAvgSamplesPerSec=6.514916221228793, CurrSamplesPerSec=6.333578295426953, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:09:40,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=5790, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:09:40,375] [INFO] [timer.py:259:stop] epoch=4/micro_step=1006/global_step=5790, RunningAvgSamplesPerSec=6.514588565724548, CurrSamplesPerSec=6.7166431696071385, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:10:32,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=5800, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:10:32,740] [INFO] [timer.py:259:stop] epoch=4/micro_step=1016/global_step=5800, RunningAvgSamplesPerSec=6.514432125956492, CurrSamplesPerSec=6.924202667787448, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:11:22,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=5810, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:11:22,648] [INFO] [timer.py:259:stop] epoch=4/micro_step=1026/global_step=5810, RunningAvgSamplesPerSec=6.514757066812239, CurrSamplesPerSec=6.423941401854996, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:12:15,317] [INFO] [logging.py:96:log_dist] [Rank 0] step=5820, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:12:15,318] [INFO] [timer.py:259:stop] epoch=4/micro_step=1036/global_step=5820, RunningAvgSamplesPerSec=6.514613447929754, CurrSamplesPerSec=7.191369406987457, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:13:07,561] [INFO] [logging.py:96:log_dist] [Rank 0] step=5830, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:13:07,562] [INFO] [timer.py:259:stop] epoch=4/micro_step=1046/global_step=5830, RunningAvgSamplesPerSec=6.514501009156554, CurrSamplesPerSec=6.590322644008081, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:13:59,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=5840, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:13:59,333] [INFO] [timer.py:259:stop] epoch=4/micro_step=1056/global_step=5840, RunningAvgSamplesPerSec=6.5145456120163665, CurrSamplesPerSec=6.490589247300017, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 06:14:52,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=5850, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:14:52,695] [INFO] [timer.py:259:stop] epoch=4/micro_step=1066/global_step=5850, RunningAvgSamplesPerSec=6.514130220260349, CurrSamplesPerSec=5.941868875823422, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:15:43,299] [INFO] [logging.py:96:log_dist] [Rank 0] step=5860, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:15:43,300] [INFO] [timer.py:259:stop] epoch=4/micro_step=1076/global_step=5860, RunningAvgSamplesPerSec=6.514324539864927, CurrSamplesPerSec=6.034968917377457, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:16:34,118] [INFO] [logging.py:96:log_dist] [Rank 0] step=5870, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:16:34,118] [INFO] [timer.py:259:stop] epoch=4/micro_step=1086/global_step=5870, RunningAvgSamplesPerSec=6.5145234248958115, CurrSamplesPerSec=6.457999473284729, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:17:27,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=5880, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:17:27,033] [INFO] [timer.py:259:stop] epoch=4/micro_step=1096/global_step=5880, RunningAvgSamplesPerSec=6.514257599131744, CurrSamplesPerSec=6.5586620848242445, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:18:20,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=5890, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:18:20,065] [INFO] [timer.py:259:stop] epoch=4/micro_step=1106/global_step=5890, RunningAvgSamplesPerSec=6.514039468822483, CurrSamplesPerSec=6.808056360683129, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:19:09,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=5900, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:19:09,838] [INFO] [timer.py:259:stop] epoch=4/micro_step=1116/global_step=5900, RunningAvgSamplesPerSec=6.514440422946004, CurrSamplesPerSec=6.758947180048648, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:20:00,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=5910, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:20:00,713] [INFO] [timer.py:259:stop] epoch=4/micro_step=1126/global_step=5910, RunningAvgSamplesPerSec=6.514608818945793, CurrSamplesPerSec=6.653348510113354, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:20:53,289] [INFO] [logging.py:96:log_dist] [Rank 0] step=5920, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:20:53,289] [INFO] [timer.py:259:stop] epoch=4/micro_step=1136/global_step=5920, RunningAvgSamplesPerSec=6.514485146213983, CurrSamplesPerSec=6.735595868004112, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:21:43,066] [INFO] [logging.py:96:log_dist] [Rank 0] step=5930, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:21:43,067] [INFO] [timer.py:259:stop] epoch=4/micro_step=1146/global_step=5930, RunningAvgSamplesPerSec=6.514913208647232, CurrSamplesPerSec=6.445150834915264, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:22:33,481] [INFO] [logging.py:96:log_dist] [Rank 0] step=5940, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:22:33,481] [INFO] [timer.py:259:stop] epoch=4/micro_step=1156/global_step=5940, RunningAvgSamplesPerSec=6.5151520233402955, CurrSamplesPerSec=7.068227648319179, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:23:24,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=5950, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:23:24,246] [INFO] [timer.py:259:stop] epoch=4/micro_step=1166/global_step=5950, RunningAvgSamplesPerSec=6.515325602161392, CurrSamplesPerSec=6.9862795329793395, MemAllocated=25.4GB, MaxMemAllocated=45.92GB +[2024-10-24 06:24:15,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=5960, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:24:15,014] [INFO] [timer.py:259:stop] epoch=4/micro_step=1176/global_step=5960, RunningAvgSamplesPerSec=6.515525213244012, CurrSamplesPerSec=7.032829209313797, MemAllocated=25.39GB, MaxMemAllocated=45.92GB +[2024-10-24 06:25:07,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=5970, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:25:07,670] [INFO] [timer.py:259:stop] epoch=4/micro_step=1186/global_step=5970, RunningAvgSamplesPerSec=6.515305957013375, CurrSamplesPerSec=6.385023472535628, MemAllocated=25.41GB, MaxMemAllocated=45.92GB +[2024-10-24 06:25:58,009] [INFO] [logging.py:96:log_dist] [Rank 0] step=5980, skipped=0, lr=[3e-05, 3e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2024-10-24 06:25:58,010] [INFO] [timer.py:259:stop] epoch=4/micro_step=1196/global_step=5980, RunningAvgSamplesPerSec=6.515615520798199, CurrSamplesPerSec=19.950173385078248, MemAllocated=25.34GB, MaxMemAllocated=45.92GB +Saving checkpoint at step 5980 ... +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 06:26:15,162] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step5980 is about to be saved! +[2024-10-24 06:26:15,163] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5980.bin, tag: global_step5980 +[2024-10-24 06:26:15,163] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5980.bin... +[2024-10-24 06:26:52,526] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5980.bin. +[2024-10-24 06:26:52,527] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step5980 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 06:27:11,942] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step5980 is about to be saved! +[2024-10-24 06:27:11,943] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5980.bin, tag: global_step5980 +[2024-10-24 06:27:11,943] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5980.bin... +[2024-10-24 06:27:52,113] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model_5980.bin. +[2024-10-24 06:27:52,114] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step5980 is ready now! +Model saved! +Checkpoint saved. + +***** Evaluating at epoch 5/5.0 ***** + +***** Evaluating at the beginning ***** +Evaluation: accuracy = 0.670096, reward_mean = 6.753209, reward_std = 3.374934 +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ prompt  ┃ higher-reward response  ┃ reward  ┃ lower-reward response  ┃ reward  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ USER: │ There are two bags visible in the image. │ 8.812500 │ There are four bags visible in the image. │ 8.625000 │ +│ How many bags are visible in the image? │ │ │ │ │ +│ │ │ │ │ │ +├────────────────────────────────────────────────┼────────────────────────────────────────────────┼──────────┼────────────────────────────────────────────────┼──────────┤ +│ USER: │ Based on the signs and advertisements visible │ 7.875000 │ Based on the presence of multiple signs, │ 1.179688 │ +│ What can you infer about the purpose of the │ in the image, it can be inferred that the │ │ including a large red sign that reads "Sale," │ │ +│ building with the pyramid-shaped structure │ building with the pyramid-shaped structure is │ │ it is likely that this building serves as an │ │ +│ based on the signs and advertisements? │ likely a shopping center or a commercial │ │ outlet for retail or commercial purposes. The │ │ +│ │ complex. The signs include names of various │ │ pyramid shape suggests that there might be │ │ +│ │ brands and stores, such as "Ochsner," "Manor," │ │ some architectural significance to the design, │ │ +│ │ and "Traita fina," which are typically │ │ but without more context, we cannot │ │ +│ │ associated with retail and shopping. │ │ definitively determine its specific function. │ │ +│ │ Additionally, the large text "Shopping Center │ │ However, given the promotional nature of the │ │ +│ │ St. Jakob-Park" further supports the idea that │ │ signs, one could reasonably assume that the │ │ +│ │ this building is a shopping center. The │ │ building houses shops or businesses offering │ │ +│ │ prominent display of the word "BASEL" suggests │ │ discounts or special deals to attract │ │ +│ │ that this shopping center is located in or │ │ customers. │ │ +│ │ near Basel, Switzerland. │ │ │ │ +├────────────────────────────────────────────────┼────────────────────────────────────────────────┼──────────┼────────────────────────────────────────────────┼──────────┤ +│ USER: │ Based on the lighting and shadows in the │ 8.312500 │ The image shows a plane at an airport terminal │ 4.656250 │ +│ Based on the lighting and shadows, can you │ image, it appears that the sun is setting. The │ │ with lights in the distance. The sky has hues │ │ +│ infer if the sun is rising or setting? │ sky has a gradient from a darker blue at the │ │ of yellow and orange near the horizon which │ │ +│ │ top to a lighter yellow-orange near the │ │ suggests that it's either sunrise or sunset. │ │ +│ │ horizon, which is typical of sunset. │ │ However, there are no direct sunlight rays │ │ +│ │ Additionally, the lights at the airport are │ │ visible, so we cannot determine whether this │ │ +│ │ on, which is more common during the evening as │ │ photo was taken during dawn (sunrise) or dusk │ │ +│ │ it gets darker. │ │ (sunset). It could be either one depending on │ │ +│ │ │ │ where the photographer took the picture from. │ │ +│ │ │ │ If they were standing facing east when taking │ │ +│ │ │ │ the shot, then it would likely be morning; but │ │ +│ │ │ │ if they faced west while capturing the scene, │ │ +│ │ │ │ it might have been evening instead. │ │ +│ │ │ │ │ │ +│ │ │ │ It appears to be early morning based off how │ │ +│ │ │ │ dark everything else looks except for those │ │ +│ │ │ │ areas around the horizon line indicating light │ │ +│ │ │ │ coming up over mountains behind them - │ │ +│ │ │ │ suggesting daytime hours ahead rather than │ │ +│ │ │ │ nighttime ones since all other parts remain │ │ +│ │ │ │ shadowed by darkness still lingering outside │ │ +│ │ │ │ city limits below us here today... │ │ +└────────────────────────────────────────────────┴────────────────────────────────────────────────┴──────────┴────────────────────────────────────────────────┴──────────┘ +Saving model to "/data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000" ... +Saving 16-bit model... +[2024-10-24 06:48:49,472] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step5980 is about to be saved! +[2024-10-24 06:48:49,473] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model.bin, tag: global_step5980 +[2024-10-24 06:48:49,473] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model.bin... +[2024-10-24 06:49:27,466] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model.bin. +[2024-10-24 06:49:27,469] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step5980 is ready now! +Model saved! +Saving 16-bit model... +[2024-10-24 06:49:46,892] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step5980 is about to be saved! +[2024-10-24 06:49:46,893] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model.bin, tag: global_step5980 +[2024-10-24 06:49:46,893] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model.bin... +[2024-10-24 06:50:25,487] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/jiayi/ti2t/rm_10_22_ours_13b_2000/pytorch_model.bin. +[2024-10-24 06:50:25,491] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step5980 is ready now! +Model saved!