AvocadoMuffin committed
Commit a93a747 · verified · 1 Parent(s): 4228969

Update train.py

Files changed (1)
  train.py  +28 -9
train.py CHANGED

@@ -226,7 +226,7 @@ def main():
         print(f"✅ Loaded {len(cuad)} examples")
     except Exception as e:
         print(f"❌ Dataset loading failed: {e}")
-        cuad = load_dataset("theatricusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")
+        cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")
 
     cuad = cuad.shuffle(seed=SEED)
 
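The only functional change in this hunk is the Hub org name: theatricusproject → theatticusproject (CUAD is published by The Atticus Project, hence "theatticus"). For context, a minimal sketch of the load-with-fallback pattern the patched line sits in; the load_cuad wrapper, its default seed, and the plain-load try branch are illustrative assumptions, since only the except branch is visible in this hunk:

    from datasets import load_dataset

    DATASET_ID = "theatticusproject/cuad-qa"  # "theatticus...", not "theatricus..."

    def load_cuad(seed: int = 42):
        """Load CUAD-QA from the Hub, retrying with a forced re-download on failure."""
        try:
            cuad = load_dataset(DATASET_ID, split="train", trust_remote_code=True)
            print(f"✅ Loaded {len(cuad)} examples")
        except Exception as e:
            print(f"❌ Dataset loading failed: {e}")
            # A stale or corrupted local cache is the usual culprit; bypass it entirely.
            cuad = load_dataset(DATASET_ID, split="train", trust_remote_code=True,
                                download_mode="force_redownload")
        return cuad.shuffle(seed=seed)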
@@ -269,21 +269,40 @@ def main():
     print(f" Training features: {len(train_feats)}")
     print(f" Validation features: {len(val_feats)}")
 
-    # ── training args ──────────────────────────────────────────────────
-    # Adjusted for smaller dataset
-    total_steps = (len(train_feats) // 16 // 2) * 6  # Rough estimate
+    # ── training args with fixed eval/save step alignment ──────────────────
+    # Calculate proper steps that align
+    batch_size = 16
+    gradient_accumulation_steps = 2
+    effective_batch_size = batch_size * gradient_accumulation_steps
+
+    # Calculate total training steps
+    num_epochs = 6 if USE_SUBSET else 4
+    steps_per_epoch = len(train_feats) // effective_batch_size
+    total_steps = steps_per_epoch * num_epochs
+
+    # Set eval steps first
+    eval_steps = max(50, steps_per_epoch // 4)  # Evaluate 4 times per epoch
+
+    # Set save steps as a multiple of eval steps
+    save_steps = eval_steps * 2  # Save every 2 evaluations
+
+    print(f"📊 Training configuration:")
+    print(f" Steps per epoch: {steps_per_epoch}")
+    print(f" Total steps: {total_steps}")
+    print(f" Eval steps: {eval_steps}")
+    print(f" Save steps: {save_steps}")
 
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
         learning_rate=5e-5,  # Slightly higher for smaller dataset
-        num_train_epochs=6 if USE_SUBSET else 4,  # More epochs for smaller dataset
-        per_device_train_batch_size=16,
+        num_train_epochs=num_epochs,
+        per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=16,
-        gradient_accumulation_steps=2,
+        gradient_accumulation_steps=gradient_accumulation_steps,
         fp16=False, bf16=True,
         eval_strategy="steps",
-        eval_steps=max(100, total_steps // 20),  # Adaptive eval steps
-        save_steps=max(200, total_steps // 10),  # Adaptive save steps
+        eval_steps=eval_steps,
+        save_steps=save_steps,
         save_total_limit=2,
         weight_decay=0.01,
         lr_scheduler_type="cosine",
 
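Why derive save_steps from eval_steps instead of computing both independently: with step-based evaluation and saving, transformers requires save_steps to be a round multiple of eval_steps whenever load_best_model_at_end=True (TrainingArguments raises a ValueError at construction otherwise; that flag is not visible in this hunk), and the old max(...) formulas could violate that. A minimal sketch of the arithmetic, using a hypothetical feature count in place of len(train_feats):

    # Hypothetical count; the script uses len(train_feats).
    num_train_features = 4000
    batch_size = 16
    gradient_accumulation_steps = 2

    effective_batch_size = batch_size * gradient_accumulation_steps  # 32
    steps_per_epoch = num_train_features // effective_batch_size     # 125

    # steps_per_epoch // 4 = 31 falls below the floor, so the floor of 50 wins.
    eval_steps = max(50, steps_per_epoch // 4)  # 50
    save_steps = eval_steps * 2                 # 100, divisible by eval_steps by construction

    assert save_steps % eval_steps == 0  # the invariant the old formulas could break
    # e.g. old scheme with total_steps = 2050: eval=max(100, 102)=102, save=max(200, 205)=205,
    # and 205 % 102 != 0, so the alignment check would fail.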