Update train.py
train.py
CHANGED
@@ -1,9 +1,7 @@
 #!/usr/bin/env python
-# train_cuad_lora_efficient.py
+# train_cuad_lora_efficient.py - FIXED VERSION
 """
-CUAD fine-tune with LoRA -
-Fixes bottlenecks and uses proper batching instead of chunking.
-GUARANTEED FIX for offset_mapping error AND metric computation issues.
+CUAD fine-tune with LoRA - Fixed for realistic training times
 """
 
 import os, json, random, gc, time
@@ -27,14 +25,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # ─────────────────────────────────────────────────────────────── config ──
 
-MAX_LEN =
-DOC_STRIDE =
+MAX_LEN = 512  # Slightly longer context
+DOC_STRIDE = 256  # Larger stride = fewer chunks = faster training
 SEED = 42
 BATCH_SIZE = 1000  # Process in larger, more efficient batches
 
-#
-USE_SUBSET = True
-SUBSET_SIZE =
+# Back to reasonable subset size since you've trained 5k before
+USE_SUBSET = True
+SUBSET_SIZE = 7000  # Good middle ground - more than your 5k success
 
 def set_seed(seed):
     random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
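For context on the new window settings: with return_overflowing_tokens=True the tokenizer splits each contract into overlapping MAX_LEN-token windows, and in the fast tokenizers `stride` is the number of tokens shared between consecutive windows. A rough, minimal sketch of the resulting feature count per contract, assuming a hypothetical 10,000-token contract (the length is illustrative, not a figure from the script):

import math

MAX_LEN = 512
DOC_STRIDE = 256

def estimated_windows(context_tokens: int, max_len: int = MAX_LEN, stride: int = DOC_STRIDE) -> int:
    # Each window after the first adds max_len - stride fresh context tokens
    # (this ignores the question tokens that also share each window).
    if context_tokens <= max_len:
        return 1
    return 1 + math.ceil((context_tokens - max_len) / (max_len - stride))

print(estimated_windows(10_000))  # ~39 windows for the assumed 10k-token contract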
@@ -48,17 +46,29 @@ def balance_has_answer(dataset, ratio=2.0, max_samples=None):
 
     print(f"Original: {len(has)} has-answer, {len(no)} no-answer")
 
+    # FIXED: Apply max_samples FIRST, then balance
+    if max_samples:
+        total_available = len(has) + len(no)
+        if total_available > max_samples:
+            # Sample proportionally from original distribution
+            has_ratio = len(has) / total_available
+            target_has = int(max_samples * has_ratio)
+            target_no = max_samples - target_has
+
+            has = random.sample(has, min(target_has, len(has)))
+            no = random.sample(no, min(target_no, len(no)))
+            print(f"Pre-balance subset: {len(has)} has-answer, {len(no)} no-answer")
+
+    # Now balance within the subset
     k = int(len(has) * ratio)
-
+    if len(no) > k:
+        no = random.sample(no, k)
 
     balanced = has + no
+    random.shuffle(balanced)  # Shuffle the final dataset
 
-
-
-        balanced = random.sample(balanced, max_samples)
-        print(f"Reduced to {max_samples} samples for faster training")
-
-    print(f"Balanced: {len([x for x in balanced if x['answers']['text']])} has-answer, {len([x for x in balanced if not x['answers']['text']])} no-answer")
+    print(f"Final balanced: {len([x for x in balanced if x['answers']['text']])} has-answer, {len([x for x in balanced if not x['answers']['text']])} no-answer")
+    print(f"Total examples: {len(balanced)}")
 
     return Dataset.from_list(balanced)
 
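To make the new subset-then-balance order concrete, here is a small worked example with hypothetical pool sizes (4,000 has-answer / 8,000 no-answer are assumptions for illustration, not CUAD's actual distribution):

import random

random.seed(0)
has = [{"answers": {"text": ["x"]}}] * 4_000   # hypothetical has-answer pool
no = [{"answers": {"text": []}}] * 8_000       # hypothetical no-answer pool
max_samples, ratio = 7_000, 1.5

# Step 1: proportional subsample down to max_samples, as in the new code path
total_available = len(has) + len(no)
target_has = int(max_samples * len(has) / total_available)   # 2333
target_no = max_samples - target_has                         # 4667
has = random.sample(has, min(target_has, len(has)))
no = random.sample(no, min(target_no, len(no)))

# Step 2: cap no-answer examples at ratio * has-answer
k = int(len(has) * ratio)                                    # 3499
if len(no) > k:
    no = random.sample(no, k)

print(len(has), len(no), len(has) + len(no))                 # 2333 3499 5832

Because the 1.5 ratio is stricter than the assumed 2:1 pool, the final set lands below max_samples, so the extra "Total examples" print is a useful check.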
@@ -105,13 +115,10 @@ def postprocess_qa(examples, features, raw_predictions, tokenizer):
 # ───────────────────────────────────────────────────────────── preprocessing ──
 
 def preprocess_training_batch(examples, tokenizer):
-    """
-    Training preprocessing - NO offset_mapping included
-    """
+    """Training preprocessing - NO offset_mapping included"""
     questions = examples["question"]
     contexts = examples["context"]
 
-    # Batch tokenization
     tokenized_examples = tokenizer(
         questions,
         contexts,
@@ -119,37 +126,30 @@ def preprocess_training_batch(examples, tokenizer):
         max_length=MAX_LEN,
         stride=DOC_STRIDE,
         return_overflowing_tokens=True,
-        return_offsets_mapping=True,
+        return_offsets_mapping=True,
         padding="max_length",
     )
 
-    # Map back to original examples
     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
-    offset_mapping = tokenized_examples.pop("offset_mapping")
+    offset_mapping = tokenized_examples.pop("offset_mapping")
 
-    # Initialize output
     start_positions = []
     end_positions = []
 
     for i, offsets in enumerate(offset_mapping):
-        cls_index = 0
-
-        # Get the original example for this tokenized chunk
+        cls_index = 0
         sample_index = sample_mapping[i]
         answers = examples["answers"][sample_index]
 
-        # Handle cases with no answer
         if not answers["text"] or not answers["text"][0]:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue
 
-        # Find answer span in tokens
         answer_start_char = answers["answer_start"][0]
         answer_text = answers["text"][0]
         answer_end_char = answer_start_char + len(answer_text)
 
-        # Find token positions
         token_start_index = cls_index
         token_end_index = cls_index
 
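For reference, the character-to-token alignment this preprocessing performs with offset_mapping can be sketched in isolation; the actual search loop sits mostly between these hunks, so this is a generic illustration rather than the script's exact code:

from typing import List, Optional, Tuple

def char_span_to_token_span(offsets: List[Tuple[int, int]],
                            start_char: int,
                            end_char: int) -> Optional[Tuple[int, int]]:
    # Skip special tokens, whose offsets are (0, 0) with fast tokenizers;
    # filtering question tokens via sequence_ids() is omitted in this sketch.
    token_start = token_end = None
    for idx, (tok_start, tok_end) in enumerate(offsets):
        if tok_start == tok_end == 0:
            continue
        if token_start is None and tok_start <= start_char < tok_end:
            token_start = idx
        if tok_start < end_char <= tok_end:
            token_end = idx
            break
    if token_start is None or token_end is None:
        return None  # answer not contained in this window; caller falls back to CLS
    return token_start, token_end

# Hypothetical offsets: index 0 is <s>, indices 1-3 cover "The fee is"
print(char_span_to_token_span([(0, 0), (0, 3), (4, 7), (8, 10)], 4, 7))  # (2, 2)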
@@ -160,7 +160,6 @@ def preprocess_training_batch(examples, tokenizer):
                 token_end_index = token_index
                 break
 
-        # Validate positions
         if token_start_index <= token_end_index and token_start_index > 0:
             start_positions.append(token_start_index)
             end_positions.append(token_end_index)
@@ -171,17 +170,13 @@ def preprocess_training_batch(examples, tokenizer):
     tokenized_examples["start_positions"] = start_positions
     tokenized_examples["end_positions"] = end_positions
 
-    # NO offset_mapping or example_id for training
     return tokenized_examples
 
 def preprocess_validation_batch(examples, tokenizer):
-    """
-    Validation preprocessing - INCLUDES offset_mapping and example_id for post-processing
-    """
+    """Validation preprocessing - INCLUDES offset_mapping and example_id"""
     questions = examples["question"]
     contexts = examples["context"]
 
-    # Batch tokenization
     tokenized_examples = tokenizer(
         questions,
         contexts,
@@ -193,21 +188,16 @@ def preprocess_validation_batch(examples, tokenizer):
         padding="max_length",
     )
 
-    # Map back to original examples
     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
 
-    # Add example IDs for evaluation
     tokenized_examples["example_id"] = [
         examples["id"][sample_mapping[i]] for i in range(len(tokenized_examples["input_ids"]))
     ]
 
-    # Keep offset_mapping for post-processing
     return tokenized_examples
 
 def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_training=True):
-    """
-    Process dataset in batches using HuggingFace's map function with batching.
-    """
+    """Process dataset in batches using HuggingFace's map function with batching."""
     print(f"{desc} dataset with batch processing...")
 
     if is_training:
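The validation features keep offset_mapping and example_id precisely so that postprocess_qa (defined earlier in the file, not shown in this diff) can turn start/end logits back into a text span of the original contract. A minimal sketch of that final decoding step, with made-up names and no n-best search:

import numpy as np

def decode_span(context, offsets, start_logits, end_logits, max_answer_len=100):
    # Greedy version: take the argmax start/end tokens and slice the raw text.
    start_idx = int(np.argmax(start_logits))
    end_idx = int(np.argmax(end_logits))
    if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_len:
        return ""  # treated as "no answer"
    start_char = offsets[start_idx][0]
    end_char = offsets[end_idx][1]
    return context[start_char:end_char]

Real post-processing typically scores the top-k start/end pairs per feature and aggregates all features sharing the same example_id before picking the best span.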
@@ -232,8 +222,7 @@ def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_train
 def main():
     set_seed(SEED)
 
-
-    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v3")
+    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v4")
 
     if (tokn := os.getenv("roberta_token")):
         try:
@@ -253,70 +242,70 @@ def main():
 
     cuad = cuad.shuffle(seed=SEED)
 
-    # Apply subset reduction
+    # FIXED: Apply subset reduction more aggressively
     subset_size = SUBSET_SIZE if USE_SUBSET else None
-    cuad = balance_has_answer(cuad, ratio=
+    cuad = balance_has_answer(cuad, ratio=1.5, max_samples=subset_size)  # Reduced ratio too
     print(f"Final dataset size: {len(cuad)} examples")
 
-    #
+    # Estimate features after preprocessing
+    avg_features_per_example = 2.5  # Conservative estimate with stride
+    estimated_features = len(cuad) * avg_features_per_example
+    print(f"Estimated training features: ~{int(estimated_features)}")
+
     ds = cuad.train_test_split(test_size=0.1, seed=SEED)
     train_raw, val_raw = ds["train"], ds["test"]
 
-    # ── tokeniser & model
+    # ── tokeniser & model ──
     base_ckpt = "deepset/roberta-base-squad2"
     tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
     model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
 
-    #
+    # FIXED: Lighter LoRA config for faster training
     lora = LoraConfig(
         task_type=TaskType.QUESTION_ANS,
-        r=
-
+        r=16,  # Reduced from 32
+        lora_alpha=32,  # Reduced from 64
+        lora_dropout=0.1,
+        target_modules=["query", "value"],  # Fewer modules
     )
     model = get_peft_model(model, lora)
     model.print_trainable_parameters()
 
-    # ──
-    print("Starting
+    # ── preprocessing ─────────────────────────────────────────
+    print("Starting preprocessing...")
 
-    # Process training data (NO offset_mapping)
     train_feats = preprocess_dataset_streaming(train_raw, tok, "Training", is_training=True)
-
-    # Process validation data (WITH offset_mapping)
     val_feats = preprocess_dataset_streaming(val_raw, tok, "Validation", is_training=False)
 
     print(f"Preprocessing completed!")
     print(f"   Training features: {len(train_feats)}")
     print(f"   Validation features: {len(val_feats)}")
-    print(f"   Training columns: {train_feats.column_names}")
-    print(f"   Validation columns: {val_feats.column_names}")
-
-    # ── No custom compute_metrics - just use loss for monitoring ──
 
-    # ── training args
-    batch_size = 16
+    # ── training args - FIXED for reasonable training time ──
+    batch_size = 16  # Good balance
     gradient_accumulation_steps = 2
     effective_batch_size = batch_size * gradient_accumulation_steps
 
-    num_epochs =
+    num_epochs = 3  # Keep it reasonable
     steps_per_epoch = len(train_feats) // effective_batch_size
     total_steps = steps_per_epoch * num_epochs
 
-    eval_steps = max(
-    save_steps = eval_steps *
+    eval_steps = max(25, steps_per_epoch // 8)  # More frequent eval
+    save_steps = eval_steps * 3
 
     print(f"Training configuration:")
+    print(f"   Effective batch size: {effective_batch_size}")
     print(f"   Steps per epoch: {steps_per_epoch}")
     print(f"   Total steps: {total_steps}")
-    print(f"
-    print(f"
+    print(f"   Estimated time: ~{total_steps/2.4/60:.1f} minutes")
+    print(f"   Eval every: {eval_steps} steps")
 
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
-        learning_rate=
+        learning_rate=3e-5,  # Slightly lower LR
         num_train_epochs=num_epochs,
         per_device_train_batch_size=batch_size,
-        per_device_eval_batch_size=
+        per_device_eval_batch_size=8,
         gradient_accumulation_steps=gradient_accumulation_steps,
         fp16=False, bf16=True,
         eval_strategy="steps",
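To sanity-check the schedule that the new code prints, the same arithmetic can be run in isolation; the 13,000 training features below are a hypothetical stand-in for len(train_feats), not a number produced by the script:

train_features = 13_000          # assumption standing in for len(train_feats)
batch_size = 16
gradient_accumulation_steps = 2
num_epochs = 3

effective_batch_size = batch_size * gradient_accumulation_steps   # 32
steps_per_epoch = train_features // effective_batch_size          # 406
total_steps = steps_per_epoch * num_epochs                        # 1218
eval_steps = max(25, steps_per_epoch // 8)                        # 50
save_steps = eval_steps * 3                                       # 150
print(effective_batch_size, steps_per_epoch, total_steps, eval_steps, save_steps)

Plugging this into the script's own time estimate (total_steps / 2.4 / 60) gives roughly 8.5 minutes for this hypothetical feature count.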
@@ -326,10 +315,8 @@ def main():
         weight_decay=0.01,
         lr_scheduler_type="cosine",
         warmup_ratio=0.1,
-        load_best_model_at_end=False,
-
-        # greater_is_better=False,  # Disabled
-        logging_steps=25,
+        load_best_model_at_end=False,
+        logging_steps=10,  # More frequent logging
         report_to="none",
         dataloader_num_workers=2,
         dataloader_pin_memory=True,
@@ -343,7 +330,7 @@ def main():
         eval_dataset=val_feats,
         tokenizer=tok,
         data_collator=default_data_collator,
-        compute_metrics=None,
+        compute_metrics=None,
     )
 
     print("Training…")
@@ -364,7 +351,7 @@ def main():
     trainer.save_model("./cuad_lora_out")
     tok.save_pretrained("./cuad_lora_out")
 
-    # Push to hub
+    # Push to hub
     if tokn:
         for attempt in range(3):
             try:
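The tail of the script (largely unchanged, so the diff only shows its first lines) retries the Hub upload a few times. A minimal sketch of such a retry loop using huggingface_hub.HfApi.upload_folder; the backoff time is a placeholder and the script's actual retry body is not shown in this diff:

import time
from huggingface_hub import HfApi

def push_with_retries(folder: str, repo_id: str, token: str, attempts: int = 3) -> bool:
    # Upload a local folder to the Hub, retrying on transient failures. Sketch only.
    api = HfApi(token=token)
    for attempt in range(attempts):
        try:
            api.upload_folder(folder_path=folder, repo_id=repo_id, repo_type="model")
            return True
        except Exception as err:                     # e.g. network hiccups, 5xx responses
            print(f"Upload attempt {attempt + 1} failed: {err}")
            time.sleep(30 * (attempt + 1))           # simple backoff
    return False

# Example call mirroring the script's paths and repo name:
# push_with_retries("./cuad_lora_out", "AvocadoMuffin/roberta-cuad-qa-v4", tokn)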