AvocadoMuffin committed on
Commit d9f65dd · verified · 1 Parent(s): c5a0569

Create train.py

Files changed (1)
  1. train.py +281 -0
train.py ADDED
@@ -0,0 +1,281 @@
+ import torch, gc, os, numpy as np, evaluate, json
+ from datasets import load_dataset
+ from transformers import (
+     AutoTokenizer, AutoModelForQuestionAnswering,
+     TrainingArguments, Trainer, default_data_collator
+ )
+ from peft import LoraConfig, get_peft_model, TaskType
+ from huggingface_hub import login
+ import sys
+
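+ # Expected environment variables: MODEL_NAME (target Hub repo id, defaults to
+ # "roberta-cuad-qa") and roberta_token (Hugging Face access token used for the upload).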
+ def main():
+     # Get model name from environment
+     model_name = os.environ.get('MODEL_NAME', 'roberta-cuad-qa')
+
+     # Login to HF Hub
+     hf_token = os.environ.get('roberta_token')
+     if hf_token:
+         login(token=hf_token)
+         print("✅ Logged into Hugging Face Hub")
+     else:
+         print("⚠️ No roberta_token found - model won't be pushed to Hub")
+
+     # Setup
+     torch.cuda.empty_cache()
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"🔧 Using device: {device}")
+
+     if torch.cuda.is_available():
+         print(f"🎯 GPU: {torch.cuda.get_device_name()}")
+         print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
+
+     # Load and prepare data
+     print("📚 Loading CUAD dataset...")
+     raw = load_dataset("theatticusproject/cuad-qa", split="train", trust_remote_code=True)
+
+     # Use subset for faster training on free GPU
+     N = 2000
+     raw = raw.shuffle(seed=42).select(range(min(N, len(raw))))
+     ds = raw.train_test_split(test_size=0.1, seed=42)
+     train_ds, val_ds = ds["train"], ds["test"]
+
+     print(f"✅ Data loaded - Train: {len(train_ds)}, Val: {len(val_ds)}")
+
+     # Store original validation data for metrics
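+     # (kept un-tokenized so the SQuAD metric can compare decoded predictions with the gold answer text)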
+     original_val_data = [ex["answers"] for ex in val_ds]
+
+     # Load model and tokenizer
+     print("🤖 Loading RoBERTa model...")
+     base_model = "roberta-base"
+     tok = AutoTokenizer.from_pretrained(base_model, use_fast=True)
+     model = AutoModelForQuestionAnswering.from_pretrained(base_model)
+
+     # Add LoRA
+     print("🔧 Adding LoRA adapters...")
+     lora_cfg = LoraConfig(
+         task_type=TaskType.QUESTION_ANS,
+         target_modules=["query", "value"],
+         r=16,
+         lora_alpha=32,  # scaling factor alpha/r = 2
+         lora_dropout=0.05,
+     )
+     model = get_peft_model(model, lora_cfg)
+     model.print_trainable_parameters()
+     model.to(device)
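+     # Only the query/value projections of each attention layer get low-rank adapters;
+     # print_trainable_parameters() above reports the resulting count (well under 1% of roberta-base).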
+
+     # Tokenization function
+     max_len, doc_stride = 384, 128
+
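+     # Long contracts are split into overlapping 384-token windows (stride 128); windows that
+     # do not contain the answer are labelled with the CLS position so the model can predict "no answer".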
+     def preprocess(examples):
+         tok_batch = tok(
+             examples["question"],
+             examples["context"],
+             truncation="only_second",
+             max_length=max_len,
+             stride=doc_stride,
+             return_overflowing_tokens=True,
+             return_offsets_mapping=True,
+             padding="max_length",
+         )
+
+         sample_map = tok_batch.pop("overflow_to_sample_mapping")
+         offset_map = tok_batch.pop("offset_mapping")
+         start_pos, end_pos = [], []
+
+         for i, offsets in enumerate(offset_map):
+             cls_idx = tok_batch["input_ids"][i].index(tok.cls_token_id)
+             sample_idx = sample_map[i]
+             answer = examples["answers"][sample_idx]
+
+             if len(answer["answer_start"]) == 0:
+                 start_pos.append(cls_idx)
+                 end_pos.append(cls_idx)
+                 continue
+
+             s_char = answer["answer_start"][0]
+             e_char = s_char + len(answer["text"][0])
+             seq_ids = tok_batch.sequence_ids(i)
+             c0, c1 = seq_ids.index(1), len(seq_ids) - 1 - seq_ids[::-1].index(1)
+
+             if not (offsets[c0][0] <= s_char <= offsets[c1][1]):
+                 start_pos.append(cls_idx)
+                 end_pos.append(cls_idx)
+                 continue
+
+             st = c0
+             while st <= c1 and offsets[st][0] <= s_char:
+                 st += 1
+             en = c1
+             while en >= c0 and offsets[en][1] >= e_char:
+                 en -= 1
+
+             # Fixed position calculation with bounds checking
+             start_pos.append(max(c0, min(st - 1, c1)))
+             end_pos.append(max(c0, min(en + 1, c1)))
+
+         tok_batch["start_positions"] = start_pos
+         tok_batch["end_positions"] = end_pos
+         return tok_batch
+
+     # Tokenize datasets
+     print("🔄 Tokenizing datasets...")
+     train_tok = train_ds.map(
+         preprocess, batched=True, batch_size=100,
+         remove_columns=train_ds.column_names,
+         desc="Tokenizing train"
+     )
+     val_tok = val_ds.map(
+         preprocess, batched=True, batch_size=100,
+         remove_columns=val_ds.column_names,
+         desc="Tokenizing validation"
+     )
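+     # Because of the overflow windows, train_tok / val_tok usually contain more rows (features)
+     # than the original train/val splits: one row per 384-token window.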
+
+     # Clean up memory
+     del raw, ds, train_ds, val_ds
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     # Metrics setup
+     metric = evaluate.load("squad")
+
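+     # postprocess() below takes a plain argmax over the start/end logits of every feature;
+     # unlike full SQuAD post-processing it does not restrict candidates to the context span
+     # or search the top-n start/end pairs, so decoded answers are a rough approximation.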
+     def postprocess(preds, dataset):
+         starts, ends = preds
+         answers = []
+         for i in range(len(starts)):
+             a, b = int(np.argmax(starts[i])), int(np.argmax(ends[i]))
+             if a > b:
+                 a, b = b, a
+             text = tok.decode(dataset[i]["input_ids"][a:b+1], skip_special_tokens=True)
+             answers.append(text.strip())
+         return answers
+
+     def compute_metrics(eval_pred):
+         try:
+             preds, _ = eval_pred
+             texts = postprocess(preds, val_tok)
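+             # NOTE: this pairs feature i with original example i; whenever a contract overflows
+             # into several windows the indices drift apart, so these scores are only a rough
+             # training signal, not a faithful CUAD evaluation.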
+             predictions = [{"id": str(i), "prediction_text": t} for i, t in enumerate(texts)]
+             references = [{"id": str(i), "answers": ans} for i, ans in enumerate(original_val_data)]
+             return metric.compute(predictions=predictions, references=references)
+         except Exception as e:
+             print(f"⚠️ Metrics computation failed: {e}")
+             return {"exact_match": 0.0, "f1": 0.0}
+
+     # Training arguments
+     output_dir = "./model_output"
+     args = TrainingArguments(
+         output_dir=output_dir,
+         per_device_train_batch_size=2,
+         per_device_eval_batch_size=4,
+         gradient_accumulation_steps=8,
+         num_train_epochs=2,
+         learning_rate=5e-4,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.1,
+         fp16=True,
+         eval_strategy="steps",
+         eval_steps=250,
+         save_steps=500,
+         save_total_limit=2,
+         logging_steps=50,
+         weight_decay=0.01,
+         remove_unused_columns=True,
+         report_to="none",  # "none" (not None) disables wandb/tensorboard reporting
+         push_to_hub=False,  # We'll do this manually
+         dataloader_pin_memory=False,  # Save memory
+     )
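+     # Effective batch size: 2 per device × 8 accumulation steps = 16 examples per optimizer update.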
+
+     # Create trainer
+     trainer = Trainer(
+         model=model,
+         args=args,
+         train_dataset=train_tok,
+         eval_dataset=val_tok,
+         tokenizer=tok,
+         data_collator=default_data_collator,
+         compute_metrics=compute_metrics,
+     )
+
+     print("🚀 Starting training...")
+     print(f"📊 Total training samples: {len(train_tok)}")
+     print(f"📊 Total validation samples: {len(val_tok)}")
+
+     if torch.cuda.is_available():
+         print(f"💾 GPU memory before training: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
+
+     # Training loop with error handling
+     try:
+         trainer.train()
+         print("✅ Training completed successfully!")
+
+     except RuntimeError as e:
+         if "CUDA out of memory" in str(e):
+             print("⚠️ GPU OOM - reducing batch size and retrying...")
+             torch.cuda.empty_cache()
+             gc.collect()
+
+             # Reduce batch size
+             args.per_device_train_batch_size = 1
+             args.gradient_accumulation_steps = 16
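+             # 1 × 16 keeps the effective batch size at 16 while roughly halving peak activation memory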
+
+             trainer = Trainer(
+                 model=model, args=args,
+                 train_dataset=train_tok, eval_dataset=val_tok,
+                 tokenizer=tok, data_collator=default_data_collator,
+                 compute_metrics=compute_metrics,
+             )
+             trainer.train()
+             print("✅ Training completed with reduced batch size!")
+         else:
+             raise e
+
+     # Save model locally first
+     print("💾 Saving model locally...")
+     os.makedirs(output_dir, exist_ok=True)
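+     # save_pretrained on the PEFT-wrapped model writes only the trained adapter weights, not the full roberta-base checkpoint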
+     trainer.model.save_pretrained(output_dir)
+     tok.save_pretrained(output_dir)
+
+     # Save training info
+     training_info = {
+         "model_name": model_name,
+         "base_model": base_model,
+         "dataset": "theatticusproject/cuad-qa",
+         "lora_config": {
+             "r": lora_cfg.r,
+             "lora_alpha": lora_cfg.lora_alpha,
+             "target_modules": list(lora_cfg.target_modules),  # peft may store this as a set, which json can't serialize
+             "lora_dropout": lora_cfg.lora_dropout,
+         },
+         "training_samples": len(train_tok),
+         "validation_samples": len(val_tok),
+     }
+
+     with open(os.path.join(output_dir, "training_info.json"), "w") as f:
+         json.dump(training_info, f, indent=2)
+
+     # Push to Hub if token available
+     if hf_token:
+         try:
+             print(f"⬆️ Pushing model to Hub: {model_name}")
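+             # A bare repo name (no "user/" prefix) is created under the account that owns the token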
+             trainer.model.push_to_hub(model_name, private=False)
+             tok.push_to_hub(model_name, private=False)
+
+             # Also push training info
+             from huggingface_hub import upload_file
+             upload_file(
+                 path_or_fileobj=os.path.join(output_dir, "training_info.json"),
+                 path_in_repo="training_info.json",
+                 repo_id=model_name,
+                 repo_type="model"
+             )
+
+             print(f"🎉 Model successfully saved to: https://huggingface.co/{model_name}")
+
+         except Exception as e:
+             print(f"❌ Failed to push to Hub: {e}")
+             print("💾 Model saved locally in ./model_output/")
+     else:
+         print("💾 Model saved locally in ./model_output/ (no HF token for Hub upload)")
+
+     print("🏁 Training pipeline completed!")
+
+ if __name__ == "__main__":
+     main()