import gc
import json
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
from peft import LoraConfig, get_peft_model, TaskType
from huggingface_hub import login


def main():
    # Get model name from environment
    model_name = os.environ.get("MODEL_NAME", "roberta-cuad-qa")

    # Login to HF Hub
    hf_token = os.environ.get("roberta_token")
    if hf_token:
        login(token=hf_token)
        print("✅ Logged into Hugging Face Hub")
    else:
        print("⚠️ No roberta_token found - model won't be pushed to Hub")

    # Setup
    torch.cuda.empty_cache()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Using device: {device}")
    if torch.cuda.is_available():
        print(f"🎯 GPU: {torch.cuda.get_device_name()}")
        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Load and prepare data - REDUCED SIZE FOR FASTER TRAINING
    print("📚 Loading CUAD dataset...")
    raw = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)

    # Use 5000 samples for good model quality - expect ~1 hour training
    N = 5000
    raw = raw.shuffle(seed=42).select(range(min(N, len(raw))))
    ds = raw.train_test_split(test_size=0.1, seed=42)
    train_ds, val_ds = ds["train"], ds["test"]
    print(f"✅ Data loaded - Train: {len(train_ds)}, Val: {len(val_ds)}")

    # Store original validation answers for metrics computation
    print("📊 Preparing metrics data...")
    original_val_data = [ex["answers"] for ex in val_ds]

    # Load model and tokenizer
    print("🤖 Loading RoBERTa model...")
    base_model = "roberta-base"
    tok = AutoTokenizer.from_pretrained(base_model, use_fast=True)
    model = AutoModelForQuestionAnswering.from_pretrained(base_model)

    # Add LoRA
    print("🔧 Adding LoRA adapters...")
    lora_cfg = LoraConfig(
        task_type=TaskType.QUESTION_ANS,
        target_modules=["query", "value"],
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()
    model.to(device)

    # Tokenization settings - long contracts are split into overlapping chunks
    # (`stride` is the token overlap between consecutive chunks of one document)
    max_len, doc_stride = 512, 400

    def preprocess(examples):
        tok_batch = tok(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=max_len,
            stride=doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        sample_map = tok_batch.pop("overflow_to_sample_mapping")
        offset_map = tok_batch.pop("offset_mapping")

        start_pos, end_pos = [], []
        for i, offsets in enumerate(offset_map):
            cls_idx = tok_batch["input_ids"][i].index(tok.cls_token_id)
            sample_idx = sample_map[i]
            answer = examples["answers"][sample_idx]

            # Unanswerable examples point at the CLS token
            if len(answer["answer_start"]) == 0:
                start_pos.append(cls_idx)
                end_pos.append(cls_idx)
                continue

            s_char = answer["answer_start"][0]
            e_char = s_char + len(answer["text"][0])

            # First/last token indices of the context in this chunk
            seq_ids = tok_batch.sequence_ids(i)
            c0 = seq_ids.index(1)
            c1 = len(seq_ids) - 1 - seq_ids[::-1].index(1)

            # If the answer is not fully inside this chunk, label it as unanswerable
            if not (offsets[c0][0] <= s_char and e_char <= offsets[c1][1]):
                start_pos.append(cls_idx)
                end_pos.append(cls_idx)
                continue

            st = c0
            while st <= c1 and offsets[st][0] <= s_char:
                st += 1
            en = c1
            while en >= c0 and offsets[en][1] >= e_char:
                en -= 1

            # Fixed position calculation with bounds checking
            start_pos.append(max(c0, min(st - 1, c1)))
            end_pos.append(max(c0, min(en + 1, c1)))

        tok_batch["start_positions"] = start_pos
        tok_batch["end_positions"] = end_pos
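        # Note (explanatory comment, not in the original source): keeping the
        # chunk→sample mapping as a dataset column lets compute_metrics aggregate
        # per-chunk predictions back to per-contract answers; the Trainer drops it
        # before the forward pass because remove_unused_columns=True.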
        # Store sample mapping for metrics calculation
        tok_batch["sample_mapping"] = sample_map
        return tok_batch

    # Tokenize datasets
    print("🔄 Tokenizing datasets...")
    train_tok = train_ds.map(
        preprocess,
        batched=True,
        batch_size=50,  # Smaller batch size for preprocessing
        remove_columns=train_ds.column_names,
        desc="Tokenizing train",
    )
    val_tok = val_ds.map(
        preprocess,
        batched=True,
        batch_size=50,
        remove_columns=val_ds.column_names,
        desc="Tokenizing validation",
    )

    # DEBUG: Print actual dataset sizes after tokenization
    print("🔍 DEBUG INFO:")
    print(f"   Original samples: {N}")
    print(f"   After tokenization - Train: {len(train_tok)}, Val: {len(val_tok)}")
    print(f"   Expansion factor: {len(train_tok)/len(train_ds):.1f}x")

    # SAFETY CHECK: If expansion is too high, reduce data size automatically
    expansion_factor = len(train_tok) / len(train_ds)
    if expansion_factor > 12:  # Slightly more permissive for 5K samples
        print(f"⚠️ HIGH EXPANSION DETECTED ({expansion_factor:.1f}x)!")
        print("🔧 Auto-reducing dataset size to prevent excessively slow training...")

        # Allow up to 20k tokenized chunks for ~1 hour of training
        target_size = min(20000, len(train_tok))  # Max 20k chunks
        train_indices = list(range(0, len(train_tok), max(1, len(train_tok) // target_size)))[:target_size]
        val_indices = list(range(0, len(val_tok), max(1, len(val_tok) // (target_size // 10))))[:target_size // 10]

        train_tok = train_tok.select(train_indices)
        val_tok = val_tok.select(val_indices)
        print(f"✅ Reduced to - Train: {len(train_tok)}, Val: {len(val_tok)}")
        print("📈 This should complete in ~45-75 minutes")

    # Clean up memory
    del raw, ds, train_ds, val_ds
    gc.collect()
    torch.cuda.empty_cache()

    # Metrics setup
    metric = evaluate.load("squad")

    def postprocess(preds, dataset):
        # Helper that decodes the argmax span for each chunk (not called by the Trainer)
        starts, ends = preds
        answers = []
        for i in range(len(starts)):
            a, b = int(np.argmax(starts[i])), int(np.argmax(ends[i]))
            if a > b:
                a, b = b, a
            text = tok.decode(dataset[i]["input_ids"][a:b + 1], skip_special_tokens=True)
            answers.append(text.strip())
        return answers

    def compute_metrics(eval_pred):
        try:
            preds, _ = eval_pred
            starts, ends = preds

            # Group predictions by original sample (handle multiple chunks per sample)
            sample_predictions = {}
            for i in range(len(starts)):
                # Map this tokenized chunk back to its original validation sample
                if "sample_mapping" in val_tok.column_names:
                    orig_idx = val_tok[i]["sample_mapping"]
                else:
                    # Fallback: assume 1:1 mapping (may be inaccurate with chunking)
                    orig_idx = min(i, len(original_val_data) - 1)

                # Get best answer span for this chunk
                start_idx = int(np.argmax(starts[i]))
                end_idx = int(np.argmax(ends[i]))
                if start_idx > end_idx:
                    start_idx, end_idx = end_idx, start_idx

                # Extract answer text
                answer_text = tok.decode(
                    val_tok[i]["input_ids"][start_idx:end_idx + 1],
                    skip_special_tokens=True,
                ).strip()

                # Keep only the highest-confidence prediction per original sample
                confidence = float(starts[i][start_idx]) + float(ends[i][end_idx])
                if orig_idx not in sample_predictions or confidence > sample_predictions[orig_idx][1]:
                    sample_predictions[orig_idx] = (answer_text, confidence)

            # Format for SQuAD metric
            predictions = []
            references = []
            for orig_idx in range(len(original_val_data)):
                pred_text = sample_predictions.get(orig_idx, ("", 0))[0]
                predictions.append({"id": str(orig_idx), "prediction_text": pred_text})
                references.append({"id": str(orig_idx), "answers": original_val_data[orig_idx]})

            result = metric.compute(predictions=predictions, references=references)

            # Add some debugging info
            print(f"📊 Evaluation: EM={result['exact_match']:.3f}, F1={result['f1']:.3f}")
            return result
        except Exception as e:
            print(f"⚠️ Metrics computation failed: {e}")
            pred_shape = np.array(preds).shape if "preds" in locals() and preds is not None else "None"
            print(f"   Pred shape: {pred_shape}")
            print(f"   Val dataset size: {len(val_tok)}")
            print(f"   Original val size: {len(original_val_data)}")
            return {"exact_match": 0.0, "f1": 0.0}

    # OPTIMIZED Training arguments
    output_dir = "./model_output"
    args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,   # INCREASED from 2
        per_device_eval_batch_size=8,    # INCREASED from 4
        gradient_accumulation_steps=2,   # REDUCED from 8
        num_train_epochs=3,              # Back to 3 epochs for better training
        learning_rate=5e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        bf16=True,                       # CHANGED from fp16 (better for newer GPUs)
        eval_strategy="steps",
        eval_steps=100,                  # REDUCED from 250
        save_steps=200,                  # REDUCED from 500
        save_total_limit=2,
        logging_steps=25,                # REDUCED from 50
        weight_decay=0.01,
        remove_unused_columns=True,
        report_to="none",                # disable external logging integrations
        push_to_hub=False,
        dataloader_pin_memory=True,      # CHANGED to True for faster data loading
        dataloader_num_workers=4,        # ADDED for parallel data loading
        gradient_checkpointing=False,    # DISABLED to trade memory for speed
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tok,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
    )

    print("🚀 Starting training...")
    print(f"📊 Total training samples: {len(train_tok)}")
    print(f"📊 Total validation samples: {len(val_tok)}")
    print(f"⚡ Effective batch size: {args.per_device_train_batch_size * args.gradient_accumulation_steps}")
    if torch.cuda.is_available():
        print(f"💾 GPU memory before training: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

    # Training loop with error handling
    try:
        trainer.train()
        print("✅ Training completed successfully!")
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            print("⚠️ GPU OOM - reducing batch size and retrying...")
            torch.cuda.empty_cache()
            gc.collect()

            # Reduce batch size and rebuild the trainer
            args.per_device_train_batch_size = 4
            args.gradient_accumulation_steps = 4
            trainer = Trainer(
                model=model,
                args=args,
                train_dataset=train_tok,
                eval_dataset=val_tok,
                tokenizer=tok,
                data_collator=default_data_collator,
                compute_metrics=compute_metrics,
            )
            trainer.train()
            print("✅ Training completed with reduced batch size!")
        else:
            raise

    # Save model locally first
    print("💾 Saving model locally...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)
    tok.save_pretrained(output_dir)

    # Save training info
    training_info = {
        "model_name": model_name,
        "base_model": base_model,
        "dataset": "theatticusproject/cuad-qa",
        "original_samples": N,
        "training_samples_after_tokenization": len(train_tok),
        "validation_samples_after_tokenization": len(val_tok),
        "lora_config": {
            "r": lora_cfg.r,
            "lora_alpha": lora_cfg.lora_alpha,
            "target_modules": list(lora_cfg.target_modules),  # may be a set; make it JSON-serializable
            "lora_dropout": lora_cfg.lora_dropout,
        },
        "training_args": {
            "batch_size": args.per_device_train_batch_size,
            "gradient_accumulation_steps": args.gradient_accumulation_steps,
            "effective_batch_size": args.per_device_train_batch_size * args.gradient_accumulation_steps,
            "epochs": args.num_train_epochs,
            "learning_rate": args.learning_rate,
        },
    }
    with open(os.path.join(output_dir, "training_info.json"), "w") as f:
        json.dump(training_info, f, indent=2)

    # Push to Hub if token available
    if hf_token:
        try:
            print(f"⬆️ Pushing model to Hub: {model_name}")
            trainer.model.push_to_hub(model_name, private=False)
            tok.push_to_hub(model_name, private=False)

            # Also push training info
            from huggingface_hub import upload_file
            upload_file(
                path_or_fileobj=os.path.join(output_dir, "training_info.json"),
                path_in_repo="training_info.json",
                repo_id=model_name,
                repo_type="model",
            )
            print(f"🎉 Model successfully saved to: https://huggingface.co/{model_name}")
        except Exception as e:
            print(f"❌ Failed to push to Hub: {e}")
            print("💾 Model saved locally in ./model_output/")
    else:
        print("💾 Model saved locally in ./model_output/ (no HF token for Hub upload)")

    print("🏁 Training pipeline completed!")


if __name__ == "__main__":
    main()
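
# Usage sketch (assumptions: the file name below is hypothetical and the token value is a
# placeholder; MODEL_NAME and roberta_token are the environment variables the script reads):
#   MODEL_NAME=roberta-cuad-qa roberta_token=hf_xxx python train_cuad_lora.py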