AvocadoMuffin committed
Commit bad80b7 · verified · 1 Parent(s): 945a2ea

Update train.py

Files changed (1)
  1. train.py +62 -75
train.py CHANGED
@@ -1,9 +1,7 @@
  #!/usr/bin/env python
- # train_cuad_lora_efficient.py
  """
- CUAD fine-tune with LoRA - Efficient batch processing version.
- Fixes bottlenecks and uses proper batching instead of chunking.
- GUARANTEED FIX for offset_mapping error AND metric computation issues.
  """
 
  import os, json, random, gc, time
@@ -27,14 +25,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
  # ─────────────────────────────────────────────────────────────── config ──
 
- MAX_LEN = 384
- DOC_STRIDE = 128
  SEED = 42
  BATCH_SIZE = 1000 # Process in larger, more efficient batches
 
- # Reduced dataset size option
- USE_SUBSET = True # Set to True to use only 10k examples
- SUBSET_SIZE = 10000
 
  def set_seed(seed):
  random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
@@ -48,17 +46,29 @@ def balance_has_answer(dataset, ratio=2.0, max_samples=None):
 
  print(f"📊 Original: {len(has)} has-answer, {len(no)} no-answer")
 
  k = int(len(has) * ratio)
- no = random.sample(no, min(k, len(no)))
 
  balanced = has + no
 
- # Apply subset limit if specified
- if max_samples and len(balanced) > max_samples:
- balanced = random.sample(balanced, max_samples)
- print(f"📉 Reduced to {max_samples} samples for faster training")
-
- print(f"📊 Balanced: {len([x for x in balanced if x['answers']['text']])} has-answer, {len([x for x in balanced if not x['answers']['text']])} no-answer")
 
  return Dataset.from_list(balanced)
 
@@ -105,13 +115,10 @@ def postprocess_qa(examples, features, raw_predictions, tokenizer):
  # ───────────────────────────────────────────────────────────── preprocessing ──
 
  def preprocess_training_batch(examples, tokenizer):
- """
- Training preprocessing - NO offset_mapping included
- """
  questions = examples["question"]
  contexts = examples["context"]
 
- # Batch tokenization
  tokenized_examples = tokenizer(
  questions,
  contexts,
@@ -119,37 +126,30 @@ def preprocess_training_batch(examples, tokenizer):
  max_length=MAX_LEN,
  stride=DOC_STRIDE,
  return_overflowing_tokens=True,
- return_offsets_mapping=True, # We need this temporarily for position calculation
  padding="max_length",
  )
 
- # Map back to original examples
  sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
- offset_mapping = tokenized_examples.pop("offset_mapping") # Remove it immediately after use
 
- # Initialize output
  start_positions = []
  end_positions = []
 
  for i, offsets in enumerate(offset_mapping):
- cls_index = 0 # CLS token position
-
- # Get the original example for this tokenized chunk
  sample_index = sample_mapping[i]
  answers = examples["answers"][sample_index]
 
- # Handle cases with no answer
  if not answers["text"] or not answers["text"][0]:
  start_positions.append(cls_index)
  end_positions.append(cls_index)
  continue
 
- # Find answer span in tokens
  answer_start_char = answers["answer_start"][0]
  answer_text = answers["text"][0]
  answer_end_char = answer_start_char + len(answer_text)
 
- # Find token positions
  token_start_index = cls_index
  token_end_index = cls_index
 
@@ -160,7 +160,6 @@ def preprocess_training_batch(examples, tokenizer):
  token_end_index = token_index
  break
 
- # Validate positions
  if token_start_index <= token_end_index and token_start_index > 0:
  start_positions.append(token_start_index)
  end_positions.append(token_end_index)
@@ -171,17 +170,13 @@ def preprocess_training_batch(examples, tokenizer):
  tokenized_examples["start_positions"] = start_positions
  tokenized_examples["end_positions"] = end_positions
 
- # NO offset_mapping or example_id for training
  return tokenized_examples
 
  def preprocess_validation_batch(examples, tokenizer):
- """
- Validation preprocessing - INCLUDES offset_mapping and example_id for post-processing
- """
  questions = examples["question"]
  contexts = examples["context"]
 
- # Batch tokenization
  tokenized_examples = tokenizer(
  questions,
  contexts,
@@ -193,21 +188,16 @@ def preprocess_validation_batch(examples, tokenizer):
  padding="max_length",
  )
 
- # Map back to original examples
  sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
 
- # Add example IDs for evaluation
  tokenized_examples["example_id"] = [
  examples["id"][sample_mapping[i]] for i in range(len(tokenized_examples["input_ids"]))
  ]
 
- # Keep offset_mapping for post-processing
  return tokenized_examples
 
  def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_training=True):
- """
- Process dataset in batches using HuggingFace's map function with batching.
- """
  print(f"🔄 {desc} dataset with batch processing...")
 
  if is_training:
@@ -232,8 +222,7 @@ def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_train
  def main():
  set_seed(SEED)
 
- # Model name to store on Hub
- model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v3")
 
  if (tokn := os.getenv("roberta_token")):
  try:
@@ -253,70 +242,70 @@ def main():
 
  cuad = cuad.shuffle(seed=SEED)
 
- # Apply subset reduction if enabled
  subset_size = SUBSET_SIZE if USE_SUBSET else None
- cuad = balance_has_answer(cuad, ratio=2.0, max_samples=subset_size)
  print(f"📊 Final dataset size: {len(cuad)} examples")
 
- # train / val 90-10
  ds = cuad.train_test_split(test_size=0.1, seed=SEED)
  train_raw, val_raw = ds["train"], ds["test"]
 
- # ── tokeniser & model (SQuAD-2 tuned) ───────────────────────────────
  base_ckpt = "deepset/roberta-base-squad2"
  tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
  model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
 
- # LoRA with slightly more aggressive settings for smaller dataset
  lora = LoraConfig(
  task_type=TaskType.QUESTION_ANS,
- r=32, lora_alpha=64, lora_dropout=0.1,
- target_modules=["query", "value", "key", "dense"],
  )
  model = get_peft_model(model, lora)
  model.print_trainable_parameters()
 
- # ── efficient preprocessing ─────────────────────────────────────────
- print("🔄 Starting efficient preprocessing...")
 
- # Process training data (NO offset_mapping)
  train_feats = preprocess_dataset_streaming(train_raw, tok, "Training", is_training=True)
-
- # Process validation data (WITH offset_mapping)
  val_feats = preprocess_dataset_streaming(val_raw, tok, "Validation", is_training=False)
 
  print(f"✅ Preprocessing completed!")
  print(f" Training features: {len(train_feats)}")
  print(f" Validation features: {len(val_feats)}")
- print(f" Training columns: {train_feats.column_names}")
- print(f" Validation columns: {val_feats.column_names}")
-
- # ── No custom compute_metrics - just use loss for monitoring ──
 
- # ── training args ──────────────
- batch_size = 16
  gradient_accumulation_steps = 2
  effective_batch_size = batch_size * gradient_accumulation_steps
 
- num_epochs = 4 if USE_SUBSET else 3
  steps_per_epoch = len(train_feats) // effective_batch_size
  total_steps = steps_per_epoch * num_epochs
 
- eval_steps = max(50, steps_per_epoch // 4)
- save_steps = eval_steps * 2
 
  print(f"📊 Training configuration:")
  print(f" Steps per epoch: {steps_per_epoch}")
  print(f" Total steps: {total_steps}")
- print(f" Eval steps: {eval_steps}")
- print(f" Save steps: {save_steps}")
 
  args = TrainingArguments(
  output_dir="./cuad_lora_out",
- learning_rate=5e-5,
  num_train_epochs=num_epochs,
  per_device_train_batch_size=batch_size,
- per_device_eval_batch_size=16,
  gradient_accumulation_steps=gradient_accumulation_steps,
  fp16=False, bf16=True,
  eval_strategy="steps",
@@ -326,10 +315,8 @@ def main():
  weight_decay=0.01,
  lr_scheduler_type="cosine",
  warmup_ratio=0.1,
- load_best_model_at_end=False, # Disable to avoid metric dependency
- # metric_for_best_model="eval_loss", # Disabled - no best model selection
- # greater_is_better=False, # Disabled
- logging_steps=25,
  report_to="none",
  dataloader_num_workers=2,
  dataloader_pin_memory=True,
@@ -343,7 +330,7 @@ def main():
  eval_dataset=val_feats,
  tokenizer=tok,
  data_collator=default_data_collator,
- compute_metrics=None, # No custom metrics - just use loss
  )
 
  print("🚀 Training…")
@@ -364,7 +351,7 @@ def main():
  trainer.save_model("./cuad_lora_out")
  tok.save_pretrained("./cuad_lora_out")
 
- # Push to hub with retry logic
  if tokn:
  for attempt in range(3):
  try:
 
  #!/usr/bin/env python
+ # train_cuad_lora_efficient.py - FIXED VERSION
  """
+ CUAD fine-tune with LoRA - Fixed for realistic training times
  """
 
  import os, json, random, gc, time
 
  # ─────────────────────────────────────────────────────────────── config ──
 
+ MAX_LEN = 512 # Slightly longer context
+ DOC_STRIDE = 256 # Larger stride = fewer chunks = faster training
  SEED = 42
  BATCH_SIZE = 1000 # Process in larger, more efficient batches
 
+ # Back to reasonable subset size since you've trained 5k before
+ USE_SUBSET = True
+ SUBSET_SIZE = 7000 # Good middle ground - more than your 5k success
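For context on the new window settings: with Hugging Face fast tokenizers, stride is the number of overlapping tokens between successive windows, so each extra window advances by roughly (max_length minus question length) minus stride tokens. A rough, illustrative sketch of how many features one long contract yields; the 1,200-token context and ~20-token question are assumed numbers, not CUAD statistics:

import math

def n_windows(context_tokens, max_len, stride, question_tokens=20):
    # Approximate context capacity of one window (special tokens ignored).
    body = max_len - question_tokens
    if context_tokens <= body:
        return 1
    step = body - stride  # how far each successive window advances
    return 1 + math.ceil((context_tokens - body) / step)

print(n_windows(1200, 384, 128))  # old settings -> 5 windows
print(n_windows(1200, 512, 256))  # new settings -> 4 windows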
 
  def set_seed(seed):
  random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
 
  print(f"📊 Original: {len(has)} has-answer, {len(no)} no-answer")
 
+ # FIXED: Apply max_samples FIRST, then balance
+ if max_samples:
+ total_available = len(has) + len(no)
+ if total_available > max_samples:
+ # Sample proportionally from original distribution
+ has_ratio = len(has) / total_available
+ target_has = int(max_samples * has_ratio)
+ target_no = max_samples - target_has
+
+ has = random.sample(has, min(target_has, len(has)))
+ no = random.sample(no, min(target_no, len(no)))
+ print(f"📉 Pre-balance subset: {len(has)} has-answer, {len(no)} no-answer")
+
+ # Now balance within the subset
  k = int(len(has) * ratio)
+ if len(no) > k:
+ no = random.sample(no, k)
 
  balanced = has + no
+ random.shuffle(balanced) # Shuffle the final dataset
 
+ print(f"📊 Final balanced: {len([x for x in balanced if x['answers']['text']])} has-answer, {len([x for x in balanced if not x['answers']['text']])} no-answer")
+ print(f"📊 Total examples: {len(balanced)}")
 
  return Dataset.from_list(balanced)
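Worked through on concrete numbers, the new order of operations (take the subset first, then cap no-answer at ratio times has-answer) behaves as in this count-only sketch; the 6,000/20,000 class split is hypothetical, not the real CUAD distribution:

def final_counts(n_has, n_no, max_samples=7000, ratio=1.5):
    # Mirrors the count arithmetic of balance_has_answer (counts only, no sampling).
    total = n_has + n_no
    if max_samples and total > max_samples:
        target_has = int(max_samples * n_has / total)
        n_has = min(target_has, n_has)
        n_no = min(max_samples - target_has, n_no)
    k = int(n_has * ratio)
    if n_no > k:
        n_no = k
    return n_has, n_no

print(final_counts(6000, 20000))  # (1615, 2422) -> about 4k examples, no-answer capped at 1.5x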
 
  # ───────────────────────────────────────────────────────────── preprocessing ──
 
  def preprocess_training_batch(examples, tokenizer):
+ """Training preprocessing - NO offset_mapping included"""
  questions = examples["question"]
  contexts = examples["context"]
 
  tokenized_examples = tokenizer(
  questions,
  contexts,
 
  max_length=MAX_LEN,
  stride=DOC_STRIDE,
  return_overflowing_tokens=True,
+ return_offsets_mapping=True,
  padding="max_length",
  )
 
  sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+ offset_mapping = tokenized_examples.pop("offset_mapping")
 
  start_positions = []
  end_positions = []
 
  for i, offsets in enumerate(offset_mapping):
+ cls_index = 0
  sample_index = sample_mapping[i]
  answers = examples["answers"][sample_index]
 
  if not answers["text"] or not answers["text"][0]:
  start_positions.append(cls_index)
  end_positions.append(cls_index)
  continue
 
  answer_start_char = answers["answer_start"][0]
  answer_text = answers["text"][0]
  answer_end_char = answer_start_char + len(answer_text)
 
  token_start_index = cls_index
  token_end_index = cls_index
 
  token_end_index = token_index
  break
 
  if token_start_index <= token_end_index and token_start_index > 0:
  start_positions.append(token_start_index)
  end_positions.append(token_end_index)
 
  tokenized_examples["start_positions"] = start_positions
  tokenized_examples["end_positions"] = end_positions
 
  return tokenized_examples
 
  def preprocess_validation_batch(examples, tokenizer):
+ """Validation preprocessing - INCLUDES offset_mapping and example_id"""
  questions = examples["question"]
  contexts = examples["context"]
 
  tokenized_examples = tokenizer(
  questions,
  contexts,
 
  padding="max_length",
  )
 
  sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
 
  tokenized_examples["example_id"] = [
  examples["id"][sample_mapping[i]] for i in range(len(tokenized_examples["input_ids"]))
  ]
 
  return tokenized_examples
 
  def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_training=True):
+ """Process dataset in batches using HuggingFace's map function with batching."""
  print(f"🔄 {desc} dataset with batch processing...")
 
  if is_training:
 
  def main():
  set_seed(SEED)
 
+ model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v4")
 
  if (tokn := os.getenv("roberta_token")):
  try:
 
  cuad = cuad.shuffle(seed=SEED)
 
+ # FIXED: Apply subset reduction more aggressively
  subset_size = SUBSET_SIZE if USE_SUBSET else None
+ cuad = balance_has_answer(cuad, ratio=1.5, max_samples=subset_size) # Reduced ratio too
  print(f"📊 Final dataset size: {len(cuad)} examples")
 
+ # Estimate features after preprocessing
+ avg_features_per_example = 2.5 # Conservative estimate with stride
+ estimated_features = len(cuad) * avg_features_per_example
+ print(f"📊 Estimated training features: ~{int(estimated_features)}")
+
  ds = cuad.train_test_split(test_size=0.1, seed=SEED)
  train_raw, val_raw = ds["train"], ds["test"]
 
+ # ── tokeniser & model ──
  base_ckpt = "deepset/roberta-base-squad2"
  tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
  model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
 
+ # FIXED: Lighter LoRA config for faster training
  lora = LoraConfig(
  task_type=TaskType.QUESTION_ANS,
+ r=16, # Reduced from 32
+ lora_alpha=32, # Reduced from 64
+ lora_dropout=0.1,
+ target_modules=["query", "value"], # Fewer modules
  )
  model = get_peft_model(model, lora)
  model.print_trainable_parameters()
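For a sense of scale, a back-of-envelope count of the adapter weights this lighter configuration trains (roberta-base: 12 layers, hidden size 768; only the LoRA factors are counted, so the figure printed by print_trainable_parameters() may differ slightly):

layers, hidden, r = 12, 768, 16
per_module = 2 * hidden * r    # LoRA factors A (r x hidden) and B (hidden x r)
n_modules = layers * 2         # "query" and "value" in every attention block
print(per_module * n_modules)  # 589824 -> well under 1% of roberta-base's ~125M weights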
 
+ # ── preprocessing ─────────────────────────────────────────
+ print("🔄 Starting preprocessing...")
 
  train_feats = preprocess_dataset_streaming(train_raw, tok, "Training", is_training=True)
  val_feats = preprocess_dataset_streaming(val_raw, tok, "Validation", is_training=False)
 
  print(f"✅ Preprocessing completed!")
  print(f" Training features: {len(train_feats)}")
  print(f" Validation features: {len(val_feats)}")
 
+ # ── training args - FIXED for reasonable training time ──
+ batch_size = 16 # Good balance
  gradient_accumulation_steps = 2
  effective_batch_size = batch_size * gradient_accumulation_steps
 
+ num_epochs = 3 # Keep it reasonable
  steps_per_epoch = len(train_feats) // effective_batch_size
  total_steps = steps_per_epoch * num_epochs
 
+ eval_steps = max(25, steps_per_epoch // 8) # More frequent eval
+ save_steps = eval_steps * 3
 
  print(f"📊 Training configuration:")
+ print(f" Effective batch size: {effective_batch_size}")
  print(f" Steps per epoch: {steps_per_epoch}")
  print(f" Total steps: {total_steps}")
+ print(f" Estimated time: ~{total_steps/2.4/60:.1f} minutes")
+ print(f" Eval every: {eval_steps} steps")
 
  args = TrainingArguments(
  output_dir="./cuad_lora_out",
+ learning_rate=3e-5, # Slightly lower LR
  num_train_epochs=num_epochs,
  per_device_train_batch_size=batch_size,
+ per_device_eval_batch_size=8,
  gradient_accumulation_steps=gradient_accumulation_steps,
  fp16=False, bf16=True,
  eval_strategy="steps",
 
  weight_decay=0.01,
  lr_scheduler_type="cosine",
  warmup_ratio=0.1,
+ load_best_model_at_end=False,
+ logging_steps=10, # More frequent logging
  report_to="none",
  dataloader_num_workers=2,
  dataloader_pin_memory=True,
 
  eval_dataset=val_feats,
  tokenizer=tok,
  data_collator=default_data_collator,
+ compute_metrics=None,
  )
 
  print("🚀 Training…")
 
  trainer.save_model("./cuad_lora_out")
  tok.save_pretrained("./cuad_lora_out")
 
+ # Push to hub
  if tokn:
  for attempt in range(3):
  try: