Spaces:

AvocadoMuffin
/

roberta_cuad_trainer

Sleeping

App Files Files Community

AvocadoMuffin commited on 19 days ago

Commit

d8f4b0f

verified ·

1 Parent(s): 06dc84e

Update train.py

Browse files

Files changed (1) hide show

train.py +163 -375

train.py CHANGED Viewed

@@ -1,409 +1,197 @@
-import torch, gc, os, numpy as np, evaluate, json
-from datasets import load_dataset
 from transformers import (
     AutoTokenizer, AutoModelForQuestionAnswering,
-    TrainingArguments, Trainer, default_data_collator
 )
 from peft import LoraConfig, get_peft_model, TaskType
 from huggingface_hub import login
-import sys
 def main():
-    # Get model name from environment
-    model_name = os.environ.get('MODEL_NAME', 'AvocadoMuffin/roberta-cuad-qa')
-    # Login to HF Hub
-    hf_token = os.environ.get('roberta_token')
-    if hf_token:
-        try:
-            login(token=hf_token)
-            print("✅ Logged into Hugging Face Hub")
-        except Exception as e:
-            print(f"⚠️ HF Hub login failed: {e}")
-            print("⚠️ Model won't be pushed to Hub")
-            hf_token = None
-    else:
-        print("⚠️ No roberta_token found - model won't be pushed to Hub")
-    # Setup
-    torch.cuda.empty_cache()
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"🔧 Using device: {device}")
-    if torch.cuda.is_available():
-        print(f"🎯 GPU: {torch.cuda.get_device_name()}")
-        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
-    # Load and prepare data - OPTIMIZED SIZE FOR FASTER TRAINING
-    print("📚 Loading CUAD dataset...")
-    try:
-        raw = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
-    except Exception as e:
-        print(f"❌ Failed to load dataset: {e}")
-        return
-    # Use 4000 samples for good model quality - expect ~1 hour training
-    N = 5000  # Good balance of quality and reasonable training time
-    raw = raw.shuffle(seed=42).select(range(min(N, len(raw))))
-    ds = raw.train_test_split(test_size=0.1, seed=42)
-    train_ds, val_ds = ds["train"], ds["test"]
-    print(f"✅ Data loaded - Train: {len(train_ds)}, Val: {len(val_ds)}")
-    # Store original validation data for metrics - CRITICAL FOR CORRECT EVALUATION
-    print("📊 Preparing metrics data...")
-    original_val_data = []
-    # Store validation answers before tokenization
-    for i, ex in enumerate(val_ds):
-        original_val_data.append(ex["answers"])
-    # Load model and tokenizer
-    print("🤖 Loading RoBERTa model...")
-    base_model = "roberta-base"
-    try:
-        tok = AutoTokenizer.from_pretrained(base_model, use_fast=True)
-        model = AutoModelForQuestionAnswering.from_pretrained(base_model)
-    except Exception as e:
-        print(f"❌ Failed to load model/tokenizer: {e}")
-        return
-    # Add LoRA
-    print("🔧 Adding LoRA adapters...")
-    lora_cfg = LoraConfig(
         task_type=TaskType.QUESTION_ANS,
         target_modules=["query", "value"],
-        r=16,
-        lora_alpha=32,
-        lora_dropout=0.05,
     )
-    model = get_peft_model(model, lora_cfg)
     model.print_trainable_parameters()
-    model.to(device)
-    # Tokenization function - OPTIMIZED TO PREVENT EXCESSIVE EXPANSION
-    max_len, doc_stride = 512, 400  # Large stride to minimize chunks per document
     def preprocess(examples):
-        tok_batch = tok(
             examples["question"],
             examples["context"],
             truncation="only_second",
-            max_length=max_len,
-            stride=doc_stride,
             return_overflowing_tokens=True,
             return_offsets_mapping=True,
             padding="max_length",
-        )
-        sample_map = tok_batch.pop("overflow_to_sample_mapping")
-        offset_map = tok_batch.pop("offset_mapping")
-        start_pos, end_pos = [], []
-        for i, offsets in enumerate(offset_map):
-            cls_idx = tok_batch["input_ids"][i].index(tok.cls_token_id)
-            sample_idx = sample_map[i]
-            answer = examples["answers"][sample_idx]
-            if len(answer["answer_start"]) == 0:
-                start_pos.append(cls_idx)
-                end_pos.append(cls_idx)
-                continue
-            s_char = answer["answer_start"][0]
-            e_char = s_char + len(answer["text"][0])
-            seq_ids = tok_batch.sequence_ids(i)
-            c0, c1 = seq_ids.index(1), len(seq_ids) - 1 - seq_ids[::-1].index(1)
-            if not (offsets[c0][0] <= s_char <= offsets[c1][1]):
-                start_pos.append(cls_idx)
-                end_pos.append(cls_idx)
-                continue
-            st = c0
-            while st <= c1 and offsets[st][0] <= s_char:
-                st += 1
-            en = c1
-            while en >= c0 and offsets[en][1] >= e_char:
-                en -= 1
-            # Fixed position calculation with bounds checking
-            start_pos.append(max(c0, min(st - 1, c1)))
-            end_pos.append(max(c0, min(en + 1, c1)))
-        tok_batch["start_positions"] = start_pos
-        tok_batch["end_positions"] = end_pos
-        # Store sample mapping for metrics calculation
-        tok_batch["sample_mapping"] = sample_map
-        return tok_batch
-    # Tokenize datasets
-    print("🔄 Tokenizing datasets...")
-    try:
-        train_tok = train_ds.map(
-            preprocess, batched=True, batch_size=50,
-            remove_columns=train_ds.column_names,
-            desc="Tokenizing train"
-        )
-        val_tok = val_ds.map(
-            preprocess, batched=True, batch_size=50,
-            remove_columns=val_ds.column_names,
-            desc="Tokenizing validation"
-        )
-    except Exception as e:
-        print(f"❌ Tokenization failed: {e}")
-        return
-    # DEBUG: Print actual dataset sizes after tokenization
-    print(f"🔍 DEBUG INFO:")
-    print(f"   Original samples: {N}")
-    print(f"   After tokenization - Train: {len(train_tok)}, Val: {len(val_tok)}")
-    print(f"   Expansion factor: {len(train_tok)/len(train_ds):.1f}x")
-    # SAFETY CHECK: If expansion is too high, reduce data size automatically
-    expansion_factor = len(train_tok) / len(train_ds)
-    if expansion_factor > 12:  # Slightly more permissive for 4K samples
-        print(f"⚠️ HIGH EXPANSION DETECTED ({expansion_factor:.1f}x)!")
-        print("🔧 Auto-reducing dataset size to prevent excessively slow training...")
-        # Allow up to 20k samples for 1 hour training
-        target_size = min(20000, len(train_tok))  # Max 20k samples
-        train_indices = list(range(0, len(train_tok), max(1, len(train_tok) // target_size)))[:target_size]
-        val_indices = list(range(0, len(val_tok), max(1, len(val_tok) // (target_size // 10))))[:target_size // 10]
-        train_tok = train_tok.select(train_indices)
-        val_tok = val_tok.select(val_indices)
-        print(f"✅ Reduced to - Train: {len(train_tok)}, Val: {len(val_tok)}")
-        print(f"📈 This should complete in ~45-75 minutes")
-    # Clean up memory
-    del raw, ds, train_ds, val_ds
-    gc.collect()
-    torch.cuda.empty_cache()
-    # FIXED: Metrics setup with proper error handling
-    try:
-        metric = evaluate.load("squad")
-    except Exception as e:
-        print(f"⚠️ Failed to load SQuAD metric: {e}")
-        metric = None
-    def compute_metrics(eval_pred):
-        if metric is None:
-            print("⚠️ No metric available, returning dummy scores")
-            return {"exact_match": 0.0, "f1": 0.0}
-        try:
-            preds, _ = eval_pred
-            starts, ends = preds
-            # Group predictions by original sample (handle multiple chunks per sample)
-            sample_predictions = {}
-            for i in range(len(starts)):
-                # FIXED: Proper dictionary access without hasattr
-                if 'sample_mapping' in val_tok[i]:
-                    orig_idx = val_tok[i]['sample_mapping']
-                else:
-                    # Fallback: assume 1:1 mapping (may be inaccurate with chunking)
-                    orig_idx = min(i, len(original_val_data) - 1)
-                # Get best answer span for this chunk
-                start_idx = int(np.argmax(starts[i]))
-                end_idx = int(np.argmax(ends[i]))
-                if start_idx > end_idx:
-                    start_idx, end_idx = end_idx, start_idx
-                # Extract answer text
-                try:
-                    answer_text = tok.decode(
-                        val_tok[i]["input_ids"][start_idx:end_idx+1],
-                        skip_special_tokens=True
-                    ).strip()
-                except Exception:
-                    answer_text = ""
-                # Store best prediction for this original sample
-                confidence = float(starts[i][start_idx]) + float(ends[i][end_idx])
-                if orig_idx not in sample_predictions or confidence > sample_predictions[orig_idx][1]:
-                    sample_predictions[orig_idx] = (answer_text, confidence)
-            # Format for SQuAD metric
-            predictions = []
-            references = []
-            for orig_idx in range(len(original_val_data)):
-                pred_text = sample_predictions.get(orig_idx, ("", 0))[0]
-                predictions.append({
-                    "id": str(orig_idx),
-                    "prediction_text": pred_text
-                })
-                references.append({
-                    "id": str(orig_idx),
-                    "answers": original_val_data[orig_idx]
-                })
-            result = metric.compute(predictions=predictions, references=references)
-            # Add some debugging info
-            print(f"📊 Evaluation: EM={result['exact_match']:.3f}, F1={result['f1']:.3f}")
-            return result
-        except Exception as e:
-            print(f"⚠️ Metrics computation failed: {e}")
-            print(f"   Pred shape: {np.array(preds).shape if preds else 'None'}")
-            print(f"   Val dataset size: {len(val_tok)}")
-            print(f"   Original val size: {len(original_val_data)}")
-            return {"exact_match": 0.0, "f1": 0.0}
-    # OPTIMIZED Training arguments
-    output_dir = "./model_output"
     args = TrainingArguments(
-        output_dir=output_dir,
-        per_device_train_batch_size=8,    # INCREASED from 2
-        per_device_eval_batch_size=8,     # INCREASED from 4
-        gradient_accumulation_steps=2,    # REDUCED from 8
-        num_train_epochs=3,               # 3 epochs for good training
-        learning_rate=5e-4,
-        lr_scheduler_type="cosine",
-        warmup_ratio=0.1,
-        bf16=True,                        # Better for newer GPUs
-        eval_strategy="steps",
-        eval_steps=100,                   # More frequent evaluation
-        save_steps=200,                   # More frequent saving
         save_total_limit=2,
-        logging_steps=25,                 # More frequent logging
         weight_decay=0.01,
-        remove_unused_columns=True,
-        report_to=None,
-        push_to_hub=False,                # We'll push manually
-        dataloader_pin_memory=True,       # Faster data loading
-        dataloader_num_workers=4,         # Parallel data loading
-        gradient_checkpointing=False,     # Trade memory for speed
-        load_best_model_at_end=True,      # Load best model
-        metric_for_best_model="f1",       # Use F1 score
         greater_is_better=True,
     )
-    # Create trainer
-    trainer = Trainer(
         model=model,
         args=args,
-        train_dataset=train_tok,
-        eval_dataset=val_tok,
         tokenizer=tok,
         data_collator=default_data_collator,
         compute_metrics=compute_metrics,
     )
-    print(f"🚀 Starting training...")
-    print(f"📊 Total training samples: {len(train_tok)}")
-    print(f"📊 Total validation samples: {len(val_tok)}")
-    print(f"⚡ Effective batch size: {args.per_device_train_batch_size * args.gradient_accumulation_steps}")
-    if torch.cuda.is_available():
-        print(f"💾 GPU memory before training: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
-    # Training loop with error handling
-    try:
-        trainer.train()
-        print("✅ Training completed successfully!")
-    except RuntimeError as e:
-        if "CUDA out of memory" in str(e):
-            print("⚠️ GPU OOM - reducing batch size and retrying...")
-            torch.cuda.empty_cache()
-            gc.collect()
-            # Reduce batch size
-            args.per_device_train_batch_size = 4
-            args.gradient_accumulation_steps = 4
-            trainer = Trainer(
-                model=model, args=args,
-                train_dataset=train_tok, eval_dataset=val_tok,
-                tokenizer=tok, data_collator=default_data_collator,
-                compute_metrics=compute_metrics,
-            )
-            trainer.train()
-            print("✅ Training completed with reduced batch size!")
-        else:
-            print(f"❌ Training failed: {e}")
-            raise e
-    except Exception as e:
-        print(f"❌ Unexpected training error: {e}")
-        return
-    # Save model locally first
-    print("💾 Saving model locally...")
-    try:
-        os.makedirs(output_dir, exist_ok=True)
-        trainer.model.save_pretrained(output_dir)
-        tok.save_pretrained(output_dir)
-        print("✅ Model saved locally")
-    except Exception as e:
-        print(f"❌ Failed to save model locally: {e}")
-        return
-    # Save training info
-    training_info = {
-        "model_name": model_name,
-        "base_model": base_model,
-        "dataset": "theatticusproject/cuad-qa",
-        "original_samples": N,
-        "training_samples_after_tokenization": len(train_tok),
-        "validation_samples_after_tokenization": len(val_tok),
-        "lora_config": {
-            "r": lora_cfg.r,
-            "lora_alpha": lora_cfg.lora_alpha,
-            "target_modules": lora_cfg.target_modules,
-            "lora_dropout": lora_cfg.lora_dropout,
-        },
-        "training_args": {
-            "batch_size": args.per_device_train_batch_size,
-            "gradient_accumulation_steps": args.gradient_accumulation_steps,
-            "effective_batch_size": args.per_device_train_batch_size * args.gradient_accumulation_steps,
-            "epochs": args.num_train_epochs,
-            "learning_rate": args.learning_rate,
-        }
-    }
-    try:
-        with open(os.path.join(output_dir, "training_info.json"), "w") as f:
-            json.dump(training_info, f, indent=2)
-    except Exception as e:
-        print(f"⚠️ Failed to save training info: {e}")
-    # Push to Hub if token available
-    if hf_token:
-        try:
-            print(f"⬆️ Pushing model to Hub: {model_name}")
-            trainer.model.push_to_hub(model_name, private=False)
-            tok.push_to_hub(model_name, private=False)
-            # Also push training info
-            try:
-                from huggingface_hub import upload_file
-                upload_file(
-                    path_or_fileobj=os.path.join(output_dir, "training_info.json"),
-                    path_in_repo="training_info.json",
-                    repo_id=model_name,
-                    repo_type="model"
-                )
-                print("📊 Training info uploaded")
-            except Exception as e:
-                print(f"⚠️ Training info upload failed: {e}")
-            print(f"🎉 Model successfully saved to: https://huggingface.co/{model_name}")
-        except Exception as e:
-            print(f"❌ Failed to push to Hub: {e}")
-            print("💾 Model saved locally in ./model_output/")
-    else:
-        print("💾 Model saved locally in ./model_output/ (no HF token for Hub upload)")
-    print("🏁 Training pipeline completed!")
 if __name__ == "__main__":
-    main()

+#!/usr/bin/env python
+# train_cuad_lora.py
+"""
+CUAD fine-tune with LoRA on an L4 / T4 GPU.
+Expected wall-clock on Nvidia L4: ~25-30 min.
+"""
+import os, json, random, gc
+from collections import defaultdict
+import torch, numpy as np
+from datasets import load_dataset, Dataset, disable_caching
 from transformers import (
     AutoTokenizer, AutoModelForQuestionAnswering,
+    TrainingArguments, default_data_collator
 )
+from transformers import QuestionAnsweringTrainer, EvalPrediction
 from peft import LoraConfig, get_peft_model, TaskType
+import evaluate
 from huggingface_hub import login
+disable_caching()                 # avoids giant disk cache on Colab
+# ─────────────────────────────────────────────────────────────── helpers ──
+MAX_LEN     = 384                 # window
+DOC_STRIDE  = 128
+SEED        = 42
+def set_seed(seed):
+    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+def balance_has_answer(dataset, ratio=2.0):
+    """Keep all has-answer rows, down-sample no-answer rows to `ratio`."""
+    has, no = [], []
+    for ex in dataset:
+        (has if ex["answers"]["text"] else no).append(ex)
+    k = int(len(has) * ratio)
+    no = random.sample(no, min(k, len(no)))
+    return Dataset.from_list(has + no)
+# ────────────────────────────────────────────────────────────── postproc ──
+metric = evaluate.load("squad")
+def postprocess_qa(examples, features, raw_predictions, tokenizer):
+    """HF-style span extraction + n-best, returns SQuAD format dict."""
+    all_start, all_end = raw_predictions
+    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+    features_per_example = defaultdict(list)
+    for i, feat_id in enumerate(features["example_id"]):
+        features_per_example[example_id_to_index[feat_id]].append(i)
+    predictions = []
+    for example_idx, example in enumerate(examples):
+        best_score = -1e9
+        best_span  = ""
+        context    = example["context"]
+        for feat_idx in features_per_example[example_idx]:
+            start_logit = all_start[feat_idx]
+            end_logit   = all_end[feat_idx]
+            offset      = features["offset_mapping"][feat_idx]
+            start_idx = int(np.argmax(start_logit))
+            end_idx   = int(np.argmax(end_logit))
+            if start_idx <= end_idx < len(offset):
+                start_char, _ = offset[start_idx]
+                _, end_char   = offset[end_idx]
+                span = context[start_char:end_char].strip()
+                score = start_logit[start_idx] + end_logit[end_idx]
+                if score > best_score and span:
+                    best_score, best_span = score, span
+        predictions.append(
+            {"id": example["id"], "prediction_text": best_span}
+        )
+    return predictions
+def compute_metrics(eval_pred: EvalPrediction):
+    predictions = postprocess_qa(raw_val, val_feats, eval_pred.predictions, tok)
+    references  = [
+        {"id": ex["id"], "answers": ex["answers"]} for ex in raw_val
+    ]
+    return metric.compute(predictions=predictions, references=references)
+# ───────────────────────────────────────────────────────────────── main ──
 def main():
+    set_seed(SEED)
+    #  model name to store on Hub
+    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v2")
+    if (tokn := os.getenv("roberta_token")):
+        try:  login(tokn);  print("🔑 HuggingFace Hub login OK")
+        except Exception as e: print("Hub login failed:", e)
+    print("📚 Loading CUAD…")
+    cuad = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
+    cuad = cuad.shuffle(seed=SEED)
+    cuad = balance_has_answer(cuad, ratio=2.0)        #  ≈18 k rows
+    # train / val 90-10
+    ds = cuad.train_test_split(test_size=0.1, seed=SEED)
+    train_raw, val_raw = ds["train"], ds["test"]
+    # ── tokeniser & model (SQuAD-2 tuned) ───────────────────────────────
+    base_ckpt = "deepset/roberta-base-squad2"
+    tok   = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
+    model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
+    # LoRA
+    lora = LoraConfig(
         task_type=TaskType.QUESTION_ANS,
+        r=16, lora_alpha=32, lora_dropout=0.05,
         target_modules=["query", "value"],
     )
+    model = get_peft_model(model, lora)
     model.print_trainable_parameters()
+    # ── preprocess ─────────────────────────────────────────────────────
     def preprocess(examples):
+        return tok(
             examples["question"],
             examples["context"],
             truncation="only_second",
+            max_length=MAX_LEN,
+            stride=DOC_STRIDE,
             return_overflowing_tokens=True,
             return_offsets_mapping=True,
             padding="max_length",
+        ) | { "example_id": examples["id"] }
+    train_feats = train_raw.map(
+        preprocess, batched=True, remove_columns=train_raw.column_names,
+        num_proc=4, desc="tokenise-train"
+    )
+    val_feats = val_raw.map(
+        preprocess, batched=True, remove_columns=val_raw.column_names,
+        num_proc=4, desc="tokenise-val"
+    )
+    global raw_val     # for metric fn
+    raw_val = val_raw
+    # ── training args ──────────────────────────────────────────────────
     args = TrainingArguments(
+        output_dir="./cuad_lora_out",
+        learning_rate=3e-5,
+        num_train_epochs=4,
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        gradient_accumulation_steps=4,      # eff. BS 32
+        fp16=False, bf16=True,              # L4 = bf16
+        evaluation_strategy="steps",
+        eval_steps=250,
+        save_steps=500,
         save_total_limit=2,
         weight_decay=0.01,
+        lr_scheduler_type="cosine",
+        warmup_ratio=0.1,
+        load_best_model_at_end=True,
+        metric_for_best_model="f1",
         greater_is_better=True,
+        logging_steps=50,
+        report_to="none",
     )
+    trainer = QuestionAnsweringTrainer(
         model=model,
         args=args,
+        train_dataset=train_feats,
+        eval_dataset=val_feats,
         tokenizer=tok,
         data_collator=default_data_collator,
         compute_metrics=compute_metrics,
     )
+    print("🚀 Training…")
+    trainer.train()
+    print("✅ Done.  Best F1:", trainer.state.best_metric)
+    trainer.save_model("./cuad_lora_out")
+    tok.save_pretrained("./cuad_lora_out")
+    # optional: push
+    if tokn:
+        trainer.push_to_hub(model_repo, private=False)
+        tok.push_to_hub(model_repo, private=False)
+        print("🚀 Pushed to:", f"https://huggingface.co/{model_repo}")
 if __name__ == "__main__":
+    main()