AvocadoMuffin committed
Commit c48cc67 · verified · 1 Parent(s): 22050d0

Update train.py

Files changed (1): train.py +217 -137
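The core of this change is a resumable, chunked preprocessing pipeline: tokenized features are written to disk chunk by chunk with torch.save, already-finished chunks are skipped on restart, and the chunks are finally combined into a single checkpoint. A minimal, self-contained sketch of that pattern (simplified to plain lists and a hypothetical process_one callable; the committed helpers are save_partial_features, load_and_combine_chunks, and preprocess_with_chunking below):

import os
import torch

CHECKPOINT_DIR = "./cuad_lora_checkpoints"
CHUNK_SIZE = 100

def process_chunked(items, name, process_one):
    """Process items chunk by chunk, checkpointing each chunk so a crash can resume."""
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    num_chunks = (len(items) + CHUNK_SIZE - 1) // CHUNK_SIZE
    for idx in range(num_chunks):
        path = f"{CHECKPOINT_DIR}/{name}_chunk_{idx:04d}.pt"
        if os.path.exists(path):   # resume: finished chunks are skipped
            continue
        chunk = [process_one(x) for x in items[idx * CHUNK_SIZE:(idx + 1) * CHUNK_SIZE]]
        torch.save(chunk, path)    # checkpoint after every chunk
    combined = []                  # zero-padded filenames keep numeric order
    for idx in range(num_chunks):
        combined.extend(torch.load(f"{CHECKPOINT_DIR}/{name}_chunk_{idx:04d}.pt", map_location="cpu"))
    return combined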
train.py CHANGED
@@ -1,9 +1,8 @@
 #!/usr/bin/env python
 # train_cuad_lora_improved.py
 """
-CUAD fine-tune with LoRA on an L4 / T4 GPU.
-Improved version with better error handling and recovery mechanisms.
-Expected wall-clock on Nvidia L4: ~25-30 min.
 """

 import os, json, random, gc, time
@@ -20,14 +19,15 @@ from peft import LoraConfig, get_peft_model, TaskType
 import evaluate
 from huggingface_hub import login

-disable_caching()  # avoids giant disk cache on Colab

 # ─────────────────────────────────────────────────────────────── helpers ──

-MAX_LEN = 384  # window
 DOC_STRIDE = 128
 SEED = 42
 CHECKPOINT_DIR = "./cuad_lora_checkpoints"

 def set_seed(seed):
     random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
@@ -43,9 +43,42 @@ def load_checkpoint(checkpoint_path):
     """Load preprocessing checkpoint from disk"""
     if os.path.exists(checkpoint_path):
         print(f"📂 Loading checkpoint: {checkpoint_path}")
-        return torch.load(checkpoint_path)
     return None

 def balance_has_answer(dataset, ratio=2.0):
     """Keep all has-answer rows, down-sample no-answer rows to `ratio`."""
     has, no = [], []
@@ -105,112 +138,163 @@ def compute_metrics(eval_pred):

 # ───────────────────────────────────────────────────────────────── main ──

-def preprocess_with_retry(dataset, dataset_name, max_retries=3):
-    """Preprocess dataset with retry logic and checkpointing"""
-
-    def preprocess(examples):
-        # Tokenize all at once
-        tokenized = tok(
-            examples["question"],
-            examples["context"],
-            truncation="only_second",
-            max_length=MAX_LEN,
-            stride=DOC_STRIDE,
-            return_overflowing_tokens=True,
-            return_offsets_mapping=True,
-            padding="max_length",
-        )
-
-        sample_mapping = tokenized.pop("overflow_to_sample_mapping")
-        offset_mapping = tokenized["offset_mapping"]
-
-        # Vectorized processing
-        start_positions = []
-        end_positions = []
-        example_ids = []
-
-        for i in range(len(tokenized["input_ids"])):
-            sample_idx = sample_mapping[i]
-            answers = examples["answers"][sample_idx]
-            offsets = offset_mapping[i]
-
-            # Find CLS token position (always 0 for RoBERTa)
-            cls_index = 0
-
-            example_ids.append(examples["id"][sample_idx])
-
-            # No answer case
-            if not answers["text"] or not answers["text"][0]:
-                start_positions.append(cls_index)
-                end_positions.append(cls_index)
-                continue
-
-            # Get answer span
-            answer_start = answers["answer_start"][0]
-            answer_text = answers["text"][0]
-            answer_end = answer_start + len(answer_text)
-
-            # Find token positions
-            start_token = end_token = cls_index
-
-            for tok_idx, (start_char, end_char) in enumerate(offsets):
-                if start_char <= answer_start < end_char:
-                    start_token = tok_idx
-                if start_char < answer_end <= end_char:
-                    end_token = tok_idx
-                    break
-
-            # Ensure valid span
-            if start_token <= end_token and start_token > 0:
-                start_positions.append(start_token)
-                end_positions.append(end_token)
-            else:
-                start_positions.append(cls_index)
-                end_positions.append(cls_index)
-
-        tokenized["start_positions"] = start_positions
-        tokenized["end_positions"] = end_positions
-        tokenized["example_id"] = example_ids
-        return tokenized
-
-    checkpoint_path = f"{CHECKPOINT_DIR}/{dataset_name}_features.pt"
-
-    # Try to load from checkpoint first
-    features = load_checkpoint(checkpoint_path)
-    if features is not None:
-        print(f"✅ Loaded {dataset_name} features from checkpoint")
-        return Dataset.from_dict(features)
-
-    # Process with retries
-    for attempt in range(max_retries):
-        try:
-            print(f"🔄 Preprocessing {dataset_name} data (attempt {attempt + 1}/{max_retries})...")
-
-            # Use smaller batch sizes and reduce num_proc for stability
-            features = dataset.map(
-                preprocess,
-                batched=True,
-                remove_columns=dataset.column_names,
-                num_proc=2,  # Reduced from 4 for stability
-                desc=f"tokenise-{dataset_name}",
-                load_from_cache_file=False,
-                batch_size=50,  # Reduced from 100 for stability
-                writer_batch_size=50  # Add writer batch size limit
-            )
-
-            # Save checkpoint after successful processing
-            save_checkpoint(features.to_dict(), checkpoint_path)
-            return features
-
-        except Exception as e:
-            print(f"❌ Preprocessing failed on attempt {attempt + 1}: {e}")
-            if attempt < max_retries - 1:
-                print("⏳ Waiting 10 seconds before retry...")
-                time.sleep(10)
-                gc.collect()  # Clean up memory
-            else:
-                print("💥 All preprocessing attempts failed!")
-                raise e

 def main():
     global val_raw, val_feats, tok
@@ -229,8 +313,7 @@ def main():
         print("🔑 HuggingFace Hub login OK")
     except Exception as e:
         print(f"⚠️ Hub login failed: {e}")
-        print("📝 Training will continue but won't push to Hub")
-        tokn = None  # Disable pushing

     print("📚 Loading CUAD…")
     dataset_checkpoint = f"{CHECKPOINT_DIR}/cuad_dataset.pt"
@@ -241,17 +324,15 @@ def main():
         cuad = Dataset.from_dict(dataset_data)
         print(f"✅ Loaded dataset from checkpoint: {len(cuad)} examples")
     else:
-        # Load and process dataset
         try:
             cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
             print(f"✅ Loaded {len(cuad)} examples")
         except Exception as e:
             print(f"❌ Dataset loading failed: {e}")
-            print("🔄 Retrying with cache disabled...")
             cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")

     cuad = cuad.shuffle(seed=SEED)
-    cuad = balance_has_answer(cuad, ratio=2.0)  # ≈18k rows
     print(f"📊 Balanced dataset: {len(cuad)} examples")

     # Save dataset checkpoint
@@ -263,7 +344,7 @@ def main():

     # ── tokeniser & model (SQuAD-2 tuned) ───────────────────────────────
     base_ckpt = "deepset/roberta-base-squad2"
-    tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
     model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)

     # LoRA
@@ -275,24 +356,30 @@ def main():
     model = get_peft_model(model, lora)
     model.print_trainable_parameters()

-    # ── preprocess with retry logic ─────────────────────────────────────────
-    train_feats = preprocess_with_retry(train_raw, "train")
-    # Remove offset_mapping from training data (not needed during training)
     if "offset_mapping" in train_feats.column_names:
         train_feats = train_feats.remove_columns(["offset_mapping"])

-    val_feats = preprocess_with_retry(val_raw, "val")
-    # Keep offset_mapping for validation (needed for postprocessing)

     # ── training args ──────────────────────────────────────────────────
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
         learning_rate=3e-5,
         num_train_epochs=4,
-        per_device_train_batch_size=8,
-        per_device_eval_batch_size=8,
-        gradient_accumulation_steps=4,  # eff. BS 32
-        fp16=False, bf16=True,  # L4 = bf16
         eval_strategy="steps",
         eval_steps=250,
         save_steps=500,
@@ -305,11 +392,9 @@ def main():
         greater_is_better=True,
         logging_steps=50,
         report_to="none",
-        # Add resume from checkpoint capability
         resume_from_checkpoint=True,
-        # Add dataloader settings for stability
-        dataloader_num_workers=0,  # Disable multiprocessing for data loading
-        dataloader_pin_memory=False,  # Reduce memory pressure
     )

     trainer = Trainer(
@@ -328,46 +413,41 @@ def main():
         print("✅ Training completed successfully!")
     except Exception as e:
         print(f"❌ Training failed: {e}")
-        print("💾 Attempting to save current state...")
         try:
             trainer.save_model("./cuad_lora_out_partial")
             tok.save_pretrained("./cuad_lora_out_partial")
-            print("💾 Partial model saved to ./cuad_lora_out_partial")
         except:
             print("❌ Could not save partial model")
         raise e

-    print("✅ Done. Best F1:", trainer.state.best_metric)
     trainer.save_model("./cuad_lora_out")
     tok.save_pretrained("./cuad_lora_out")

-    # optional: push (with retry logic)
     if tokn:
-        max_push_retries = 3
-        for push_attempt in range(max_push_retries):
             try:
-                print(f"⬆️ Pushing to Hub (attempt {push_attempt + 1}/{max_push_retries})...")
                 trainer.push_to_hub(model_repo, private=False)
                 tok.push_to_hub(model_repo, private=False)
                 print("🚀 Pushed to:", f"https://huggingface.co/{model_repo}")
                 break
             except Exception as e:
-                print(f"⚠️ Hub push failed on attempt {push_attempt + 1}: {e}")
-                if push_attempt < max_push_retries - 1:
-                    print("⏳ Waiting 30 seconds before retry...")
                     time.sleep(30)
                 else:
-                    print("💾 Model saved locally in ./cuad_lora_out (push failed)")
-    else:
-        print("💾 Model saved locally in ./cuad_lora_out (no HF token for push)")

-    # Clean up checkpoints after successful completion
     try:
         import shutil
         shutil.rmtree(CHECKPOINT_DIR)
         print("🧹 Cleaned up temporary checkpoints")
     except:
-        print("⚠️ Could not clean up temporary checkpoints")

 if __name__ == "__main__":
     main()
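Both the removed and the added preprocessing share the same character-to-token alignment step: walk offset_mapping until one token's character span contains answer_start and another contains answer_end, falling back to the CLS index when no span matches. A toy illustration with hypothetical offsets:

offsets = [(0, 0), (0, 4), (5, 9), (10, 18)]   # (start_char, end_char) per token; (0, 0) is the CLS slot
answer_start, answer_end = 5, 18               # answer covers characters 5..17
start_token = end_token = 0                    # fall back to CLS (index 0) if no match
for tok_idx, (start_char, end_char) in enumerate(offsets):
    if start_char <= answer_start < end_char:
        start_token = tok_idx
    if start_char < answer_end <= end_char:
        end_token = tok_idx
        break
print(start_token, end_token)                  # -> 2 3

The updated version of train.py, with added lines marked +, follows.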
 
 #!/usr/bin/env python
 # train_cuad_lora_improved.py
 """
+CUAD fine-tune with LoRA on an L40S GPU in HuggingFace Spaces.
+Improved version with better error handling and chunked processing.
 """

 import os, json, random, gc, time

 import evaluate
 from huggingface_hub import login

+disable_caching()

 # ─────────────────────────────────────────────────────────────── helpers ──

+MAX_LEN = 384
 DOC_STRIDE = 128
 SEED = 42
 CHECKPOINT_DIR = "./cuad_lora_checkpoints"
+CHUNK_SIZE = 100  # Process in smaller chunks to avoid timeouts

 def set_seed(seed):
     random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)

     """Load preprocessing checkpoint from disk"""
     if os.path.exists(checkpoint_path):
         print(f"📂 Loading checkpoint: {checkpoint_path}")
+        return torch.load(checkpoint_path, map_location='cpu')
     return None

+def save_partial_features(features_dict, chunk_idx, dataset_name):
+    """Save partial features for a chunk"""
+    partial_path = f"{CHECKPOINT_DIR}/{dataset_name}_chunk_{chunk_idx:04d}.pt"
+    save_checkpoint(features_dict, partial_path)
+    return partial_path
+
+def load_and_combine_chunks(dataset_name):
+    """Load all chunk files and combine them"""
+    chunk_files = []
+    if os.path.exists(CHECKPOINT_DIR):
+        for f in os.listdir(CHECKPOINT_DIR):
+            if f.startswith(f"{dataset_name}_chunk_") and f.endswith('.pt'):
+                chunk_files.append(os.path.join(CHECKPOINT_DIR, f))
+
+    if not chunk_files:
+        return None
+
+    chunk_files.sort()
+    print(f"📂 Found {len(chunk_files)} chunks for {dataset_name}")
+
+    # Combine all chunks
+    combined = None
+    for chunk_file in chunk_files:
+        chunk_data = torch.load(chunk_file, map_location='cpu')
+        if combined is None:
+            combined = chunk_data
+        else:
+            for key in chunk_data:
+                combined[key].extend(chunk_data[key])
+
+    print(f"✅ Combined {len(combined['input_ids'])} features from chunks")
+    return combined
+
 def balance_has_answer(dataset, ratio=2.0):
     """Keep all has-answer rows, down-sample no-answer rows to `ratio`."""
     has, no = [], []

 # ───────────────────────────────────────────────────────────────── main ──

+def preprocess_single_example(example, tokenizer):
+    """Process a single example to avoid batch processing issues"""
+    # Tokenize
+    tokenized = tokenizer(
+        example["question"],
+        example["context"],
+        truncation="only_second",
+        max_length=MAX_LEN,
+        stride=DOC_STRIDE,
+        return_overflowing_tokens=True,
+        return_offsets_mapping=True,
+        padding="max_length",
+    )
+
+    results = {
+        "input_ids": [],
+        "attention_mask": [],
+        "start_positions": [],
+        "end_positions": [],
+        "example_id": [],
+        "offset_mapping": []
+    }
+
+    for i in range(len(tokenized["input_ids"])):
+        results["input_ids"].append(tokenized["input_ids"][i])
+        results["attention_mask"].append(tokenized["attention_mask"][i])
+        results["offset_mapping"].append(tokenized["offset_mapping"][i])
+        results["example_id"].append(example["id"])
+
+        # Handle answer positions
+        answers = example["answers"]
+        offsets = tokenized["offset_mapping"][i]
+        cls_index = 0
+
+        if not answers["text"] or not answers["text"][0]:
+            results["start_positions"].append(cls_index)
+            results["end_positions"].append(cls_index)
+            continue
+
+        answer_start = answers["answer_start"][0]
+        answer_text = answers["text"][0]
+        answer_end = answer_start + len(answer_text)
+
+        start_token = end_token = cls_index
+
+        for tok_idx, (start_char, end_char) in enumerate(offsets):
+            if start_char <= answer_start < end_char:
+                start_token = tok_idx
+            if start_char < answer_end <= end_char:
+                end_token = tok_idx
+                break
+
+        if start_token <= end_token and start_token > 0:
+            results["start_positions"].append(start_token)
+            results["end_positions"].append(end_token)
+        else:
+            results["start_positions"].append(cls_index)
+            results["end_positions"].append(cls_index)
+
+    return results

+def preprocess_with_chunking(dataset, dataset_name, tokenizer):
+    """Process dataset in chunks to avoid timeouts"""
+
+    # Check if final result already exists
+    final_checkpoint = f"{CHECKPOINT_DIR}/{dataset_name}_features.pt"
+    final_features = load_checkpoint(final_checkpoint)
+    if final_features is not None:
+        print(f"✅ Loaded {dataset_name} features from final checkpoint")
+        return Dataset.from_dict(final_features)
+
+    # Check if we can resume from chunks
+    combined_features = load_and_combine_chunks(dataset_name)
+    if combined_features is not None:
+        # Save as final checkpoint
+        save_checkpoint(combined_features, final_checkpoint)
+        return Dataset.from_dict(combined_features)
+
+    # Process in chunks
+    print(f"🔄 Processing {dataset_name} dataset in chunks of {CHUNK_SIZE}...")
+
+    total_samples = len(dataset)
+    num_chunks = (total_samples + CHUNK_SIZE - 1) // CHUNK_SIZE
+
+    for chunk_idx in range(num_chunks):
+        chunk_file = f"{CHECKPOINT_DIR}/{dataset_name}_chunk_{chunk_idx:04d}.pt"
+
+        # Skip if chunk already processed
+        if os.path.exists(chunk_file):
+            print(f"⏭️ Chunk {chunk_idx + 1}/{num_chunks} already exists, skipping...")
+            continue
+
+        start_idx = chunk_idx * CHUNK_SIZE
+        end_idx = min(start_idx + CHUNK_SIZE, total_samples)
+
+        print(f"🔄 Processing chunk {chunk_idx + 1}/{num_chunks} (samples {start_idx}-{end_idx-1})...")
+
+        chunk_results = {
+            "input_ids": [],
+            "attention_mask": [],
+            "start_positions": [],
+            "end_positions": [],
+            "example_id": [],
+            "offset_mapping": []
+        }
+
+        # Process each example in the chunk individually
+        for i in range(start_idx, end_idx):
+            if i % 10 == 0:  # Progress indicator
+                print(f"  Processing sample {i}/{total_samples}")
+
+            try:
+                example = dataset[i]
+                result = preprocess_single_example(example, tokenizer)
+
+                # Add to chunk results
+                for key in chunk_results:
+                    chunk_results[key].extend(result[key])
+
+            except Exception as e:
+                print(f"⚠️ Error processing sample {i}: {e}")
+                continue
+
+        # Save chunk
+        save_partial_features(chunk_results, chunk_idx, dataset_name)
+
+        # Clean up memory
+        del chunk_results
+        gc.collect()
+
+        print(f"✅ Chunk {chunk_idx + 1}/{num_chunks} completed and saved")
+
+    # Combine all chunks
+    print("🔄 Combining all chunks...")
+    combined_features = load_and_combine_chunks(dataset_name)
+
+    if combined_features is None:
+        raise RuntimeError("Failed to load and combine chunks!")
+
+    # Save final result
+    save_checkpoint(combined_features, final_checkpoint)
+
+    # Clean up chunk files
+    cleanup_chunk_files(dataset_name)
+
+    return Dataset.from_dict(combined_features)
+
+def cleanup_chunk_files(dataset_name):
+    """Remove chunk files after successful combination"""
+    if os.path.exists(CHECKPOINT_DIR):
+        for f in os.listdir(CHECKPOINT_DIR):
+            if f.startswith(f"{dataset_name}_chunk_") and f.endswith('.pt'):
+                try:
+                    os.remove(os.path.join(CHECKPOINT_DIR, f))
+                except:
+                    pass
+    print(f"🧹 Cleaned up chunk files for {dataset_name}")

 def main():
     global val_raw, val_feats, tok

         print("🔑 HuggingFace Hub login OK")
     except Exception as e:
         print(f"⚠️ Hub login failed: {e}")
+        tokn = None

     print("📚 Loading CUAD…")
     dataset_checkpoint = f"{CHECKPOINT_DIR}/cuad_dataset.pt"

         cuad = Dataset.from_dict(dataset_data)
         print(f"✅ Loaded dataset from checkpoint: {len(cuad)} examples")
     else:
         try:
             cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
             print(f"✅ Loaded {len(cuad)} examples")
         except Exception as e:
             print(f"❌ Dataset loading failed: {e}")
             cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")

     cuad = cuad.shuffle(seed=SEED)
+    cuad = balance_has_answer(cuad, ratio=2.0)
     print(f"📊 Balanced dataset: {len(cuad)} examples")

     # Save dataset checkpoint

     # ── tokeniser & model (SQuAD-2 tuned) ───────────────────────────────
     base_ckpt = "deepset/roberta-base-squad2"
+    tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
     model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)

     # LoRA

     model = get_peft_model(model, lora)
     model.print_trainable_parameters()

+    # ── preprocess with chunking ─────────────────────────────────────────
+    print("🔄 Starting preprocessing...")
+
+    train_feats = preprocess_with_chunking(train_raw, "train", tok)
+    # Remove offset_mapping from training data
     if "offset_mapping" in train_feats.column_names:
         train_feats = train_feats.remove_columns(["offset_mapping"])

+    val_feats = preprocess_with_chunking(val_raw, "val", tok)
+    # Keep offset_mapping for validation
+
+    print("✅ Preprocessing completed!")
+    print(f"  Training features: {len(train_feats)}")
+    print(f"  Validation features: {len(val_feats)}")

     # ── training args ──────────────────────────────────────────────────
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
         learning_rate=3e-5,
         num_train_epochs=4,
+        per_device_train_batch_size=16,  # Increased for L40S
+        per_device_eval_batch_size=16,
+        gradient_accumulation_steps=2,  # Reduced since batch size increased
+        fp16=False, bf16=True,
         eval_strategy="steps",
         eval_steps=250,
         save_steps=500,

         greater_is_better=True,
         logging_steps=50,
         report_to="none",
         resume_from_checkpoint=True,
+        dataloader_num_workers=2,  # L40S can handle more workers
+        dataloader_pin_memory=True,
     )

     trainer = Trainer(

         print("✅ Training completed successfully!")
     except Exception as e:
         print(f"❌ Training failed: {e}")
         try:
             trainer.save_model("./cuad_lora_out_partial")
             tok.save_pretrained("./cuad_lora_out_partial")
+            print("💾 Partial model saved")
         except:
             print("❌ Could not save partial model")
         raise e

+    print("✅ Done. Best F1:", trainer.state.best_metric)
     trainer.save_model("./cuad_lora_out")
     tok.save_pretrained("./cuad_lora_out")

+    # Push to hub with retry logic
     if tokn:
+        for attempt in range(3):
             try:
+                print(f"⬆️ Pushing to Hub (attempt {attempt + 1}/3)...")
                 trainer.push_to_hub(model_repo, private=False)
                 tok.push_to_hub(model_repo, private=False)
                 print("🚀 Pushed to:", f"https://huggingface.co/{model_repo}")
                 break
             except Exception as e:
+                print(f"⚠️ Hub push failed: {e}")
+                if attempt < 2:
                     time.sleep(30)
                 else:
+                    print("💾 Model saved locally (push failed)")

+    # Clean up checkpoints
     try:
         import shutil
         shutil.rmtree(CHECKPOINT_DIR)
         print("🧹 Cleaned up temporary checkpoints")
     except:
+        print("⚠️ Could not clean up checkpoints")

 if __name__ == "__main__":
     main()
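Two notes on pieces the diff does not show. First, save_checkpoint is called throughout but its definition sits in an unchanged hunk; given that load_checkpoint wraps torch.load, it is presumably a thin torch.save wrapper along these lines (an assumption, not the committed code):

import os
import torch

def save_checkpoint(obj, checkpoint_path):
    # Hypothetical reconstruction: persist a checkpoint, creating its directory first.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(obj, checkpoint_path)
    print(f"💾 Saved checkpoint: {checkpoint_path}")

Second, the batch-size change keeps the effective train batch size constant: 8 × 4 = 32 before, 16 × 2 = 32 after; the larger per-device batch simply shifts more of that work onto the L40S's memory.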