Update train.py
train.py
CHANGED
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
-#
+# train_cuad_lora_efficient.py
 """
-CUAD fine-tune with LoRA
-
+CUAD fine-tune with LoRA - Efficient batch processing version.
+Fixes bottlenecks and uses proper batching instead of chunking.
 """

 import os, json, random, gc, time
@@ -21,72 +21,42 @@ from huggingface_hub import login

 disable_caching()

-# ───────────────────────────────────────────────────────────────
+# ─────────────────────────────────────────────────────── config ──

-MAX_LEN
-DOC_STRIDE
-SEED
-
-
+MAX_LEN = 384
+DOC_STRIDE = 128
+SEED = 42
+BATCH_SIZE = 1000 # Process in larger, more efficient batches
+
+# Reduced dataset size option
+USE_SUBSET = True # Set to True to use only 10k examples
+SUBSET_SIZE = 10000

 def set_seed(seed):
     random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)

-def
-    """Save preprocessing checkpoint to disk"""
-    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
-    torch.save(data, checkpoint_path)
-    print(f"💾 Checkpoint saved: {checkpoint_path}")
-
-def load_checkpoint(checkpoint_path):
-    """Load preprocessing checkpoint from disk"""
-    if os.path.exists(checkpoint_path):
-        print(f"📂 Loading checkpoint: {checkpoint_path}")
-        return torch.load(checkpoint_path, map_location='cpu')
-    return None
-
-def save_partial_features(features_dict, chunk_idx, dataset_name):
-    """Save partial features for a chunk"""
-    partial_path = f"{CHECKPOINT_DIR}/{dataset_name}_chunk_{chunk_idx:04d}.pt"
-    save_checkpoint(features_dict, partial_path)
-    return partial_path
-
-def load_and_combine_chunks(dataset_name):
-    """Load all chunk files and combine them"""
-    chunk_files = []
-    if os.path.exists(CHECKPOINT_DIR):
-        for f in os.listdir(CHECKPOINT_DIR):
-            if f.startswith(f"{dataset_name}_chunk_") and f.endswith('.pt'):
-                chunk_files.append(os.path.join(CHECKPOINT_DIR, f))
-
-    if not chunk_files:
-        return None
-
-    chunk_files.sort()
-    print(f"📁 Found {len(chunk_files)} chunks for {dataset_name}")
-
-    # Combine all chunks
-    combined = None
-    for chunk_file in chunk_files:
-        chunk_data = torch.load(chunk_file, map_location='cpu')
-        if combined is None:
-            combined = chunk_data
-        else:
-            for key in chunk_data:
-                combined[key].extend(chunk_data[key])
-
-    print(f"✅ Combined {len(combined['input_ids'])} features from chunks")
-    return combined
-
-def balance_has_answer(dataset, ratio=2.0):
+def balance_has_answer(dataset, ratio=2.0, max_samples=None):
     """Keep all has-answer rows, down-sample no-answer rows to `ratio`."""
     has, no = [], []
     for ex in dataset:
         (has if ex["answers"]["text"] else no).append(ex)
+
+    print(f"📊 Original: {len(has)} has-answer, {len(no)} no-answer")
+
     k = int(len(has) * ratio)
     no = random.sample(no, min(k, len(no)))
-
+
+    balanced = has + no
+
+    # Apply subset limit if specified
+    if max_samples and len(balanced) > max_samples:
+        balanced = random.sample(balanced, max_samples)
+        print(f"📉 Reduced to {max_samples} samples for faster training")
+
+    print(f"📊 Balanced: {len([x for x in balanced if x['answers']['text']])} has-answer, {len([x for x in balanced if not x['answers']['text']])} no-answer")
+
+    return Dataset.from_list(balanced)

 # ────────────────────────────────────────────────────────────── postproc ──

@@ -104,20 +74,20 @@ def postprocess_qa(examples, features, raw_predictions, tokenizer):

     for example_idx, example in enumerate(examples):
         best_score = -1e9
-        best_span
-        context
+        best_span = ""
+        context = example["context"]

         for feat_idx in features_per_example[example_idx]:
             start_logit = all_start[feat_idx]
-            end_logit
-            offset
+            end_logit = all_end[feat_idx]
+            offset = features["offset_mapping"][feat_idx]

             start_idx = int(np.argmax(start_logit))
-            end_idx
+            end_idx = int(np.argmax(end_logit))

             if start_idx <= end_idx < len(offset):
                 start_char, _ = offset[start_idx]
-                _, end_char
+                _, end_char = offset[end_idx]
                 span = context[start_char:end_char].strip()
                 score = start_logit[start_idx] + end_logit[end_idx]
                 if score > best_score and span:
@@ -131,19 +101,25 @@ def postprocess_qa(examples, features, raw_predictions, tokenizer):
 def compute_metrics(eval_pred):
     """Use regular eval_pred structure and correct variable names"""
     predictions = postprocess_qa(val_raw, val_feats, eval_pred.predictions, tok)
-    references
+    references = [
         {"id": ex["id"], "answers": ex["answers"]} for ex in val_raw
     ]
     return metric.compute(predictions=predictions, references=references)

-#
+# ───────────────────────────────────────────────────────────── preprocessing ──

-def
-    """
-
-
-
-
+def preprocess_batch_efficient(examples, tokenizer):
+    """
+    Efficient batch preprocessing using HuggingFace's built-in batch processing.
+    This is much faster than processing examples individually.
+    """
+    questions = examples["question"]
+    contexts = examples["context"]
+
+    # Batch tokenization - this is the key efficiency gain
+    tokenized_examples = tokenizer(
+        questions,
+        contexts,
         truncation="only_second",
         max_length=MAX_LEN,
         stride=DOC_STRIDE,
@@ -152,160 +128,89 @@ def preprocess_single_example(example, tokenizer):
         padding="max_length",
     )

-
-
-
-
-
-
-        "offset_mapping": []
-    }
+    # Map back to original examples
+    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+    # Initialize output
+    start_positions = []
+    end_positions = []

-    for i in
-
-
-        results["offset_mapping"].append(tokenized["offset_mapping"][i])
-        results["example_id"].append(example["id"])
+    for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
+        input_ids = tokenized_examples["input_ids"][i]
+        cls_index = 0 # CLS token position

-        #
-
-
-        cls_index = 0
+        # Get the original example for this tokenized chunk
+        sample_index = sample_mapping[i]
+        answers = examples["answers"][sample_index]

+        # Handle cases with no answer
         if not answers["text"] or not answers["text"][0]:
-
-
+            start_positions.append(cls_index)
+            end_positions.append(cls_index)
             continue
-
-
+
+        # Find answer span in tokens
+        answer_start_char = answers["answer_start"][0]
         answer_text = answers["text"][0]
-
+        answer_end_char = answer_start_char + len(answer_text)

-
+        # Find token positions
+        token_start_index = cls_index
+        token_end_index = cls_index

-        for
-            if start_char <=
-
-            if start_char <
-
+        for token_index, (start_char, end_char) in enumerate(offsets):
+            if start_char <= answer_start_char < end_char:
+                token_start_index = token_index
+            if start_char < answer_end_char <= end_char:
+                token_end_index = token_index
                 break

-
-
-
+        # Validate positions
+        if token_start_index <= token_end_index and token_start_index > 0:
+            start_positions.append(token_start_index)
+            end_positions.append(token_end_index)
         else:
-
-
-
-    return results
-
-def preprocess_with_chunking(dataset, dataset_name, tokenizer):
-    """Process dataset in chunks to avoid timeouts"""
+            start_positions.append(cls_index)
+            end_positions.append(cls_index)

-
-
-    final_features = load_checkpoint(final_checkpoint)
-    if final_features is not None:
-        print(f"✅ Loaded {dataset_name} features from final checkpoint")
-        return Dataset.from_dict(final_features)
+    tokenized_examples["start_positions"] = start_positions
+    tokenized_examples["end_positions"] = end_positions

-    #
-
-
-
-        save_checkpoint(combined_features, final_checkpoint)
-        return Dataset.from_dict(combined_features)
-
-    # Process in chunks
-    print(f"🔄 Processing {dataset_name} dataset in chunks of {CHUNK_SIZE}...")
-
-    total_samples = len(dataset)
-    num_chunks = (total_samples + CHUNK_SIZE - 1) // CHUNK_SIZE
-
-    for chunk_idx in range(num_chunks):
-        chunk_file = f"{CHECKPOINT_DIR}/{dataset_name}_chunk_{chunk_idx:04d}.pt"
-
-        # Skip if chunk already processed
-        if os.path.exists(chunk_file):
-            print(f"⏭️ Chunk {chunk_idx + 1}/{num_chunks} already exists, skipping...")
-            continue
-
-        start_idx = chunk_idx * CHUNK_SIZE
-        end_idx = min(start_idx + CHUNK_SIZE, total_samples)
-
-        print(f"🔄 Processing chunk {chunk_idx + 1}/{num_chunks} (samples {start_idx}-{end_idx-1})...")
-
-        chunk_results = {
-            "input_ids": [],
-            "attention_mask": [],
-            "start_positions": [],
-            "end_positions": [],
-            "example_id": [],
-            "offset_mapping": []
-        }
-
-        # Process each example in the chunk individually
-        for i in range(start_idx, end_idx):
-            if i % 10 == 0: # Progress indicator
-                print(f" Processing sample {i}/{total_samples}")
-
-            try:
-                example = dataset[i]
-                result = preprocess_single_example(example, tokenizer)
-
-                # Add to chunk results
-                for key in chunk_results:
-                    chunk_results[key].extend(result[key])
-
-            except Exception as e:
-                print(f"⚠️ Error processing sample {i}: {e}")
-                continue
-
-        # Save chunk
-        save_partial_features(chunk_results, chunk_idx, dataset_name)
-
-        # Clean up memory
-        del chunk_results
-        gc.collect()
-
-        print(f"✅ Chunk {chunk_idx + 1}/{num_chunks} completed and saved")
-
-    # Combine all chunks
-    print("🔄 Combining all chunks...")
-    combined_features = load_and_combine_chunks(dataset_name)
-
-    if combined_features is None:
-        raise RuntimeError("Failed to load and combine chunks!")
+    # Add example IDs for evaluation
+    tokenized_examples["example_id"] = [
+        examples["id"][sample_mapping[i]] for i in range(len(tokenized_examples["input_ids"]))
+    ]

-
-
+    return tokenized_examples
+
+def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing"):
+    """
+    Process dataset in batches using HuggingFace's map function with batching.
+    This is much more memory efficient and faster than manual chunking.
+    """
+    print(f"🔄 {desc} dataset with batch processing...")

-
-
+    processed = dataset.map(
+        lambda examples: preprocess_batch_efficient(examples, tokenizer),
+        batched=True,
+        batch_size=BATCH_SIZE,
+        remove_columns=dataset.column_names,
+        desc=desc,
+        num_proc=1, # Use 1 process to avoid memory issues in Spaces
+    )

-
-
-
-
-    if os.path.exists(CHECKPOINT_DIR):
-        for f in os.listdir(CHECKPOINT_DIR):
-            if f.startswith(f"{dataset_name}_chunk_") and f.endswith('.pt'):
-                try:
-                    os.remove(os.path.join(CHECKPOINT_DIR, f))
-                except:
-                    pass
-    print(f"🧹 Cleaned up chunk files for {dataset_name}")
+    print(f"✅ {desc} completed: {len(processed)} features")
+    return processed
+
+# ─────────────────────────────────────────────────────────────── main ──

 def main():
     global val_raw, val_feats, tok

     set_seed(SEED)
-
-    # Create checkpoint directory
-    os.makedirs(CHECKPOINT_DIR, exist_ok=True)

     # Model name to store on Hub
-    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-
+    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v3")

     if (tokn := os.getenv("roberta_token")):
         try:
@@ -316,27 +221,19 @@ def main():
         tokn = None

     print("📚 Loading CUAD…")
-
+    try:
+        cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
+        print(f"✅ Loaded {len(cuad)} examples")
+    except Exception as e:
+        print(f"❌ Dataset loading failed: {e}")
+        cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")

-
-
-
-
-
-
-    try:
-        cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
-        print(f"✅ Loaded {len(cuad)} examples")
-    except Exception as e:
-        print(f"❌ Dataset loading failed: {e}")
-        cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True, download_mode="force_redownload")
-
-    cuad = cuad.shuffle(seed=SEED)
-    cuad = balance_has_answer(cuad, ratio=2.0)
-    print(f"📊 Balanced dataset: {len(cuad)} examples")
-
-    # Save dataset checkpoint
-    save_checkpoint(cuad.to_dict(), dataset_checkpoint)
+    cuad = cuad.shuffle(seed=SEED)
+
+    # Apply subset reduction if enabled
+    subset_size = SUBSET_SIZE if USE_SUBSET else None
+    cuad = balance_has_answer(cuad, ratio=2.0, max_samples=subset_size)
+    print(f"📊 Final dataset size: {len(cuad)} examples")

     # train / val 90-10
     ds = cuad.train_test_split(test_size=0.1, seed=SEED)
@@ -347,42 +244,46 @@ def main():
     tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
     model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)

-    # LoRA
+    # LoRA with slightly more aggressive settings for smaller dataset
     lora = LoraConfig(
         task_type=TaskType.QUESTION_ANS,
-        r=
-        target_modules=["query", "value"],
+        r=32, lora_alpha=64, lora_dropout=0.1, # Increased for better learning with less data
+        target_modules=["query", "value", "key", "dense"], # More modules for better coverage
     )
     model = get_peft_model(model, lora)
     model.print_trainable_parameters()

-    # ──
-    print("🚀 Starting preprocessing...")
+    # ── efficient preprocessing ─────────────────────────────────────────
+    print("🚀 Starting efficient preprocessing...")

-
-
+    # Process training data
+    train_feats = preprocess_dataset_streaming(train_raw, tok, "Training")
+    # Remove offset_mapping for training
     if "offset_mapping" in train_feats.column_names:
         train_feats = train_feats.remove_columns(["offset_mapping"])

-
-
+    # Process validation data (keep offset_mapping for evaluation)
+    val_feats = preprocess_dataset_streaming(val_raw, tok, "Validation")

     print(f"✅ Preprocessing completed!")
     print(f" Training features: {len(train_feats)}")
     print(f" Validation features: {len(val_feats)}")

     # ── training args ──────────────────────────────────────────────────
+    # Adjusted for smaller dataset
+    total_steps = (len(train_feats) // 16 // 2) * 6 # Rough estimate
+
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
-        learning_rate=
-        num_train_epochs=4,
-        per_device_train_batch_size=16,
+        learning_rate=5e-5, # Slightly higher for smaller dataset
+        num_train_epochs=6 if USE_SUBSET else 4, # More epochs for smaller dataset
+        per_device_train_batch_size=16,
         per_device_eval_batch_size=16,
-        gradient_accumulation_steps=2,
+        gradient_accumulation_steps=2,
         fp16=False, bf16=True,
         eval_strategy="steps",
-        eval_steps=
-        save_steps=
+        eval_steps=max(100, total_steps // 20), # Adaptive eval steps
+        save_steps=max(200, total_steps // 10), # Adaptive save steps
         save_total_limit=2,
         weight_decay=0.01,
         lr_scheduler_type="cosine",
@@ -390,11 +291,11 @@ def main():
         load_best_model_at_end=True,
         metric_for_best_model="f1",
         greater_is_better=True,
-        logging_steps=
+        logging_steps=25,
         report_to="none",
-
-        dataloader_num_workers=2, # L40S can handle more workers
+        dataloader_num_workers=2,
         dataloader_pin_memory=True,
+        remove_unused_columns=False, # Keep example_id for evaluation
     )

     trainer = Trainer(
@@ -441,13 +342,5 @@ def main():
     else:
         print("💾 Model saved locally (push failed)")

-    # Clean up checkpoints
-    try:
-        import shutil
-        shutil.rmtree(CHECKPOINT_DIR)
-        print("🧹 Cleaned up temporary checkpoints")
-    except:
-        print("⚠️ Could not clean up checkpoints")
-
 if __name__ == "__main__":
     main()
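
The new preprocessing path relies on the fast tokenizer's overflow handling: a long contract is split into several overlapping MAX_LEN-token windows, and overflow_to_sample_mapping records which source example each window came from (this is what preprocess_batch_efficient pops and uses). A minimal sketch of that mechanism, using an assumed roberta-base checkpoint and toy data rather than anything from the commit:

    # Sketch of the batched, overflowing tokenization that preprocess_batch_efficient builds on.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)  # assumed checkpoint, not taken from the script

    questions = ["Highlight the parts related to termination."]
    contexts = ["This Agreement may be terminated by either party upon written notice. " * 200]  # long enough to overflow

    enc = tok(
        questions,
        contexts,
        truncation="only_second",        # truncate only the context, never the question
        max_length=384,                  # MAX_LEN in the script
        stride=128,                      # DOC_STRIDE: overlap between consecutive windows
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    print(len(enc["input_ids"]))              # several features produced from the single example
    print(enc["overflow_to_sample_mapping"])  # e.g. [0, 0, 0, ...]: every feature maps back to example 0

The script itself is driven by the two environment variables read in main(); a hypothetical local run, with the file name and token value as placeholders, could look like:

    import os, subprocess

    os.environ["MODEL_NAME"] = "AvocadoMuffin/roberta-cuad-qa-v3"  # Hub repo, read via os.getenv("MODEL_NAME")
    os.environ["roberta_token"] = "<your-hf-token>"                # read via os.getenv("roberta_token")
    subprocess.run(["python", "train.py"], check=True)             # assumes the file is saved locally as train.py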
|