Update train.py
train.py
CHANGED
@@ -1,12 +1,14 @@
 #!/usr/bin/env python
-#
+# train_cuad_lora_improved.py
 """
 CUAD fine-tune with LoRA on an L4 / T4 GPU.
+Improved version with better error handling and recovery mechanisms.
 Expected wall-clock on Nvidia L4: ~25-30 min.
 """
 
-import os, json, random, gc
+import os, json, random, gc, time
 from collections import defaultdict
+from pathlib import Path
 
 import torch, numpy as np
 from datasets import load_dataset, Dataset, disable_caching
@@ -14,7 +16,6 @@ from transformers import (
     AutoTokenizer, AutoModelForQuestionAnswering,
     TrainingArguments, default_data_collator, Trainer
 )
-# FIXED: Use regular Trainer instead of QuestionAnsweringTrainer
 from peft import LoraConfig, get_peft_model, TaskType
 import evaluate
 from huggingface_hub import login
@@ -26,11 +27,25 @@ disable_caching() # avoids giant disk cache on Colab
 MAX_LEN = 384 # window
 DOC_STRIDE = 128
 SEED = 42
+CHECKPOINT_DIR = "./cuad_lora_checkpoints"
 
 def set_seed(seed):
     random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
 
+def save_checkpoint(data, checkpoint_path):
+    """Save preprocessing checkpoint to disk"""
+    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
+    torch.save(data, checkpoint_path)
+    print(f"💾 Checkpoint saved: {checkpoint_path}")
+
+def load_checkpoint(checkpoint_path):
+    """Load preprocessing checkpoint from disk"""
+    if os.path.exists(checkpoint_path):
+        print(f"📂 Loading checkpoint: {checkpoint_path}")
+        return torch.load(checkpoint_path)
+    return None
+
 def balance_has_answer(dataset, ratio=2.0):
     """Keep all has-answer rows, down-sample no-answer rows to `ratio`."""
     has, no = [], []
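The checkpoint helpers added above round-trip tokenized features through torch.save(features.to_dict(), ...), which pickles every column and re-materializes it in RAM on load. A minimal sketch of an Arrow-native alternative, assuming the same checkpoint layout; save_to_disk/load_from_disk are the datasets library's own persistence API and keep the data memory-mapped rather than fully loaded:

    import os
    from datasets import Dataset, load_from_disk

    def save_checkpoint_arrow(ds: Dataset, path: str) -> None:
        ds.save_to_disk(path)  # writes Arrow files instead of a single pickle

    def load_checkpoint_arrow(path: str):
        # Returns a memory-mapped Dataset, or None if no checkpoint exists yet
        return load_from_disk(path) if os.path.isdir(path) else None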
@@ -81,7 +96,7 @@ def postprocess_qa(examples, features, raw_predictions, tokenizer):
     return predictions
 
 def compute_metrics(eval_pred):
-    """
+    """Use regular eval_pred structure and correct variable names"""
     predictions = postprocess_qa(val_raw, val_feats, eval_pred.predictions, tok)
     references = [
         {"id": ex["id"], "answers": ex["answers"]} for ex in val_raw
@@ -90,55 +105,9 @@ def compute_metrics(eval_pred):
 
 # ───────────────────────────────────────────────────────────────── main ──
 
-def main():
-
+def preprocess_with_retry(dataset, dataset_name, max_retries=3):
+    """Preprocess dataset with retry logic and checkpointing"""
 
-    set_seed(SEED)
-
-    # model name to store on Hub
-    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v2")
-
-    if (tokn := os.getenv("roberta_token")):
-        try:
-            login(tokn)
-            print("🔐 HuggingFace Hub login OK")
-        except Exception as e:
-            print(f"⚠️ Hub login failed: {e}")
-            print("📝 Training will continue but won't push to Hub")
-            tokn = None # Disable pushing
-
-    print("📚 Loading CUAD…")
-    try:
-        cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
-        print(f"✅ Loaded {len(cuad)} examples")
-    except Exception as e:
-        print(f"❌ Dataset loading failed: {e}")
-        print("🔄 Retrying with cache disabled...")
-        cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")
-
-    cuad = cuad.shuffle(seed=SEED)
-    cuad = balance_has_answer(cuad, ratio=2.0) # ≈18 k rows
-    print(f"📊 Balanced dataset: {len(cuad)} examples")
-
-    # train / val 90-10
-    ds = cuad.train_test_split(test_size=0.1, seed=SEED)
-    train_raw, val_raw = ds["train"], ds["test"]
-
-    # ── tokeniser & model (SQuAD-2 tuned) ───────────────────────────────
-    base_ckpt = "deepset/roberta-base-squad2"
-    tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
-    model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
-
-    # LoRA
-    lora = LoraConfig(
-        task_type=TaskType.QUESTION_ANS,
-        r=16, lora_alpha=32, lora_dropout=0.05,
-        target_modules=["query", "value"],
-    )
-    model = get_peft_model(model, lora)
-    model.print_trainable_parameters()
-
-    # ── preprocess (OPTIMIZED) ─────────────────────────────────────────
     def preprocess(examples):
         # Tokenize all at once
         tokenized = tok(
@@ -204,29 +173,115 @@ def main():
         tokenized["example_id"] = example_ids
         return tokenized
 
-
-    train_feats = train_raw.map(
-        preprocess,
-        batched=True,
-        remove_columns=train_raw.column_names,
-        num_proc=4, # Use multiple processes for speed
-        desc="tokenise-train",
-        load_from_cache_file=False,
-        batch_size=100 # Process in smaller batches
+    checkpoint_path = f"{CHECKPOINT_DIR}/{dataset_name}_features.pt"
+
+    # Try to load from checkpoint first
+    features = load_checkpoint(checkpoint_path)
+    if features is not None:
+        print(f"✅ Loaded {dataset_name} features from checkpoint")
+        return Dataset.from_dict(features)
+
+    # Process with retries
+    for attempt in range(max_retries):
+        try:
+            print(f"🔄 Preprocessing {dataset_name} data (attempt {attempt + 1}/{max_retries})...")
+
+            # Use smaller batch sizes and reduce num_proc for stability
+            features = dataset.map(
+                preprocess,
+                batched=True,
+                remove_columns=dataset.column_names,
+                num_proc=2, # Reduced from 4 for stability
+                desc=f"tokenise-{dataset_name}",
+                load_from_cache_file=False,
+                batch_size=50, # Reduced from 100 for stability
+                writer_batch_size=50 # Add writer batch size limit
+            )
+
+            # Save checkpoint after successful processing
+            save_checkpoint(features.to_dict(), checkpoint_path)
+            return features
+
+        except Exception as e:
+            print(f"❌ Preprocessing failed on attempt {attempt + 1}: {e}")
+            if attempt < max_retries - 1:
+                print(f"⏳ Waiting 10 seconds before retry...")
+                time.sleep(10)
+                gc.collect() # Clean up memory
+            else:
+                print("💥 All preprocessing attempts failed!")
+                raise e
+
+def main():
+    global val_raw, val_feats, tok
+
+    set_seed(SEED)
+
+    # Create checkpoint directory
+    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+
+    # Model name to store on Hub
+    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v2")
+
+    if (tokn := os.getenv("roberta_token")):
+        try:
+            login(tokn)
+            print("🔐 HuggingFace Hub login OK")
+        except Exception as e:
+            print(f"⚠️ Hub login failed: {e}")
+            print("📝 Training will continue but won't push to Hub")
+            tokn = None # Disable pushing
+
+    print("📚 Loading CUAD…")
+    dataset_checkpoint = f"{CHECKPOINT_DIR}/cuad_dataset.pt"
+
+    # Try to load dataset from checkpoint
+    dataset_data = load_checkpoint(dataset_checkpoint)
+    if dataset_data is not None:
+        cuad = Dataset.from_dict(dataset_data)
+        print(f"✅ Loaded dataset from checkpoint: {len(cuad)} examples")
+    else:
+        # Load and process dataset
+        try:
+            cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
+            print(f"✅ Loaded {len(cuad)} examples")
+        except Exception as e:
+            print(f"❌ Dataset loading failed: {e}")
+            print("🔄 Retrying with cache disabled...")
+            cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")
+
+        cuad = cuad.shuffle(seed=SEED)
+        cuad = balance_has_answer(cuad, ratio=2.0) # ≈18 k rows
+        print(f"📊 Balanced dataset: {len(cuad)} examples")
+
+        # Save dataset checkpoint
+        save_checkpoint(cuad.to_dict(), dataset_checkpoint)
+
+    # train / val 90-10
+    ds = cuad.train_test_split(test_size=0.1, seed=SEED)
+    train_raw, val_raw = ds["train"], ds["test"]
+
+    # ── tokeniser & model (SQuAD-2 tuned) ───────────────────────────────
+    base_ckpt = "deepset/roberta-base-squad2"
+    tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
+    model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
+
+    # LoRA
+    lora = LoraConfig(
+        task_type=TaskType.QUESTION_ANS,
+        r=16, lora_alpha=32, lora_dropout=0.05,
+        target_modules=["query", "value"],
     )
+    model = get_peft_model(model, lora)
+    model.print_trainable_parameters()
+
+    # ── preprocess with retry logic ─────────────────────────────────────
+    train_feats = preprocess_with_retry(train_raw, "train")
     # Remove offset_mapping from training data (not needed during training)
-    train_feats = train_feats.remove_columns(["offset_mapping"])
+    if "offset_mapping" in train_feats.column_names:
+        train_feats = train_feats.remove_columns(["offset_mapping"])
 
-
-    val_feats = val_raw.map(
-        preprocess,
-        batched=True,
-        remove_columns=val_raw.column_names,
-        num_proc=4, # Use multiple processes for speed
-        desc="tokenise-val",
-        load_from_cache_file=False,
-        batch_size=100 # Process in smaller batches
-    )
+    val_feats = preprocess_with_retry(val_raw, "val")
     # Keep offset_mapping for validation (needed for postprocessing)
 
     # ── training args ──────────────────────────────────────────────────
@@ -250,9 +305,13 @@ def main():
         greater_is_better=True,
         logging_steps=50,
         report_to="none",
+        # Add resume from checkpoint capability
+        resume_from_checkpoint=True,
+        # Add dataloader settings for stability
+        dataloader_num_workers=0, # Disable multiprocessing for data loading
+        dataloader_pin_memory=False, # Reduce memory pressure
     )
 
-    # FIXED: Use regular Trainer instead of QuestionAnsweringTrainer
     trainer = Trainer(
         model=model,
         args=args,
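One caveat on the hunk above: resume_from_checkpoint is set inside TrainingArguments, where Trainer merely stores it; a run actually resumes only when the flag is passed to trainer.train(). A minimal sketch of that wiring, reusing this script's args and trainer names; get_last_checkpoint is the transformers helper that finds the newest checkpoint-* folder:

    import os
    from transformers.trainer_utils import get_last_checkpoint

    # train(resume_from_checkpoint=True) raises if output_dir holds no
    # checkpoint-* directory, so resolve the path first and fall back to
    # a fresh run when nothing is there yet.
    last_ckpt = get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
    trainer.train(resume_from_checkpoint=last_ckpt)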
@@ -264,7 +323,19 @@ def main():
     )
 
     print("🚀 Training…")
-    trainer.train()
+    try:
+        trainer.train()
+        print("✅ Training completed successfully!")
+    except Exception as e:
+        print(f"❌ Training failed: {e}")
+        print("💾 Attempting to save current state...")
+        try:
+            trainer.save_model("./cuad_lora_out_partial")
+            tok.save_pretrained("./cuad_lora_out_partial")
+            print("💾 Partial model saved to ./cuad_lora_out_partial")
+        except:
+            print("❌ Could not save partial model")
+        raise e
 
     print("✅ Done. Best F1:", trainer.state.best_metric)
     trainer.save_model("./cuad_lora_out")
@@ -272,16 +343,31 @@ def main():
 
     # optional: push (with retry logic)
     if tokn:
-        try:
-            print("⬆️ Pushing to Hub...")
-            trainer.push_to_hub(model_repo, private=False)
-            tok.push_to_hub(model_repo, private=False)
-            print("🎉 Pushed to:", f"https://huggingface.co/{model_repo}")
-        except Exception as e:
-            print(f"⚠️ Hub push failed: {e}")
-            print("💾 Model saved locally in ./cuad_lora_out (push failed)")
+        max_push_retries = 3
+        for push_attempt in range(max_push_retries):
+            try:
+                print(f"⬆️ Pushing to Hub (attempt {push_attempt + 1}/{max_push_retries})...")
+                trainer.push_to_hub(model_repo, private=False)
+                tok.push_to_hub(model_repo, private=False)
+                print("🎉 Pushed to:", f"https://huggingface.co/{model_repo}")
+                break
+            except Exception as e:
+                print(f"⚠️ Hub push failed on attempt {push_attempt + 1}: {e}")
+                if push_attempt < max_push_retries - 1:
+                    print("⏳ Waiting 30 seconds before retry...")
+                    time.sleep(30)
+                else:
+                    print("💾 Model saved locally in ./cuad_lora_out (push failed)")
     else:
         print("💾 Model saved locally in ./cuad_lora_out (no HF token for push)")
 
+    # Clean up checkpoints after successful completion
+    try:
+        import shutil
+        shutil.rmtree(CHECKPOINT_DIR)
+        print("🧹 Cleaned up temporary checkpoints")
+    except:
+        print("⚠️ Could not clean up temporary checkpoints")
+
 if __name__ == "__main__":
     main()
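After a successful run, ./cuad_lora_out holds only the LoRA adapter, not full model weights. A sketch of reloading it for inference, assuming the base checkpoint used in this script; PeftModel.from_pretrained and merge_and_unload are standard PEFT APIs, and the example question/context strings are made up:

    from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
    from peft import PeftModel

    base = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
    model = PeftModel.from_pretrained(base, "./cuad_lora_out")  # adapter dir written by trainer.save_model
    model = model.merge_and_unload()  # fold the LoRA deltas into the base weights

    tok = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
    qa = pipeline("question-answering", model=model, tokenizer=tok)
    print(qa(question="What law governs this agreement?",
             context="This Agreement shall be governed by the laws of the State of Delaware."))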