training config differences (same dataset)
Collection
3 items
•
Updated
current batches:
nv3[v0] (1700) | nv4[v1-2k] (4000) | nv4[v1-210k] (b1b2: 4000)
metrics:
636 ***** train metrics *****
1637 epoch = 20.0
1638 total_flos = 66966619017GF
1639 train_loss = 0.2338
1640 train_runtime = 0:58:49.65
1641 train_samples_per_second = 56.736
1642 train_steps_per_second = 0.89
1644 ***** eval metrics *****
1645 epoch = 20.0
1646 eval_accuracy = 0.7521
1647 eval_loss = 0.8814
1648 eval_runtime = 0:00:12.42
1649 eval_samples_per_second = 142.171
1650 eval_steps_per_second = 2.977
# since ordinal on pretrained anime is really bad, let's try doing it on pretrained as classifier instead:
BASE_MODEL = "facebook/dinov2-with-registers-large"
DATASET = "distill-lab/COMBINE_nai-distill_00-01_eagle.library"
TASK = "classification"
# using single card to train it, so had to do higher batch size
cmd = f"""python -m trainlib.hf_trainer.cli \
--model_name_or_path {BASE_MODEL} \
--dataset_name {DATASET} \
--output_dir distill-n4_00-01_combined_cls_v1b2-100e \
--remove_unused_columns False \
--label_column_name star \
--task {TASK} \
--do_train \
--do_eval \
--eval_strategy steps \
--eval_steps 100 \
--learning_rate 1e-5 \
--num_train_epochs 20 \
--per_device_train_batch_size 64 \
--per_device_eval_batch_size 48 \
--logging_strategy steps \
--logging_steps 2 \
--save_total_limit 1 \
--seed 1337 \
--lr_scheduler_type cosine \
--dataloader_num_workers 16 \
--ignore_mismatched_sizes True
"""
# Hub-upload flags staged separately (not currently appended to `cmd`).
# Fixes vs. the earlier draft:
#   * `--push_to_hub: True` had a stray colon — HF's HfArgumentParser expects
#     `--push_to_hub True` (matching the `--flag value` style used in `cmd`).
#   * `--hub_strategy "end"""` left an unbalanced double quote in the string
#     (the closing triple-quote swallowed the value's closing `"`).
rest: str = f"""
    --push_to_hub True \
    --push_to_hub_organization distill-lab \
    --hub_model_id nai-distill_00-01_combined_eagle_{TASK} \
    --hub_strategy end"""
# Echo the assembled command for the run log, then execute it in the shell.
# NOTE: `!{cmd}` is IPython line magic — this cell runs only inside
# Jupyter/IPython, not under plain `python`.
print(cmd)
!{cmd}