AvocadoMuffin committed
Commit 2901c84 · verified · 1 Parent(s): 64fe93c

Update train.py

Files changed (1)
  1. train.py +55 -67
train.py CHANGED
@@ -3,6 +3,7 @@
 """
 CUAD fine-tune with LoRA - Efficient batch processing version.
 Fixes bottlenecks and uses proper batching instead of chunking.
+GUARANTEED FIX for offset_mapping error.
 """
 
 import os, json, random, gc, time
@@ -21,6 +22,9 @@ from huggingface_hub import login
 
 disable_caching()
 
+# Set tokenizers parallelism to avoid warnings
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 # ─────────────────────────────────────────────────────────────── config ──
 
 MAX_LEN = 384
@@ -108,15 +112,14 @@ def compute_metrics(eval_pred):
 
 # ───────────────────────────────────────────────────────────── preprocessing ──
 
-def preprocess_batch_efficient(examples, tokenizer, is_training=True):
+def preprocess_training_batch(examples, tokenizer):
     """
-    Efficient batch preprocessing using HuggingFace's built-in batch processing.
-    This is much faster than processing examples individually.
+    Training preprocessing - NO offset_mapping included
     """
     questions = examples["question"]
     contexts = examples["context"]
 
-    # Batch tokenization - this is the key efficiency gain
+    # Batch tokenization
     tokenized_examples = tokenizer(
         questions,
         contexts,
@@ -124,19 +127,19 @@ def preprocess_batch_efficient(examples, tokenizer, is_training=True):
         max_length=MAX_LEN,
         stride=DOC_STRIDE,
         return_overflowing_tokens=True,
-        return_offsets_mapping=True,
+        return_offsets_mapping=True, # We need this temporarily for position calculation
        padding="max_length",
     )
 
     # Map back to original examples
     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    offset_mapping = tokenized_examples.pop("offset_mapping") # Remove it immediately after use
 
     # Initialize output
     start_positions = []
     end_positions = []
 
-    for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
-        input_ids = tokenized_examples["input_ids"][i]
+    for i, offsets in enumerate(offset_mapping):
         cls_index = 0 # CLS token position
 
         # Get the original example for this tokenized chunk
@@ -176,73 +179,68 @@ def preprocess_batch_efficient(examples, tokenizer, is_training=True):
     tokenized_examples["start_positions"] = start_positions
     tokenized_examples["end_positions"] = end_positions
 
+    # NO offset_mapping or example_id for training
+    return tokenized_examples
+
+def preprocess_validation_batch(examples, tokenizer):
+    """
+    Validation preprocessing - INCLUDES offset_mapping and example_id for post-processing
+    """
+    questions = examples["question"]
+    contexts = examples["context"]
+
+    # Batch tokenization
+    tokenized_examples = tokenizer(
+        questions,
+        contexts,
+        truncation="only_second",
+        max_length=MAX_LEN,
+        stride=DOC_STRIDE,
+        return_overflowing_tokens=True,
+        return_offsets_mapping=True,
+        padding="max_length",
+    )
+
+    # Map back to original examples
+    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
     # Add example IDs for evaluation
     tokenized_examples["example_id"] = [
         examples["id"][sample_mapping[i]] for i in range(len(tokenized_examples["input_ids"]))
     ]
 
-    # CRITICAL FIX: Remove offset_mapping for training data
-    # Only keep it for evaluation data
-    if is_training:
-        tokenized_examples.pop("offset_mapping", None)
-
+    # Keep offset_mapping for post-processing
     return tokenized_examples
 
 def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_training=True):
     """
     Process dataset in batches using HuggingFace's map function with batching.
-    This is much more memory efficient and faster than manual chunking.
     """
     print(f"🔄 {desc} dataset with batch processing...")
 
+    if is_training:
+        preprocess_fn = preprocess_training_batch
+    else:
+        preprocess_fn = preprocess_validation_batch
+
     processed = dataset.map(
-        lambda examples: preprocess_batch_efficient(examples, tokenizer, is_training),
+        lambda examples: preprocess_fn(examples, tokenizer),
         batched=True,
         batch_size=BATCH_SIZE,
         remove_columns=dataset.column_names,
        desc=desc,
-        num_proc=1, # Use 1 process to avoid memory issues in Spaces
+        num_proc=1,
     )
 
     print(f"✅ {desc} completed: {len(processed)} features")
     return processed
 
-# Custom data collator that ensures no unwanted keys are passed to the model
-class QADataCollator:
-    def __init__(self, tokenizer, training=True):
-        self.tokenizer = tokenizer
-        self.training = training
-        # Keys that the model expects
-        self.model_input_names = ["input_ids", "attention_mask", "token_type_ids", "start_positions", "end_positions"]
-
-    def __call__(self, features):
-        # Use the default collator first
-        batch = default_data_collator(features)
-
-        # For training, only keep model input keys
-        if self.training:
-            batch = {k: v for k, v in batch.items() if k in self.model_input_names}
-        else:
-            # For evaluation, keep example_id and offset_mapping for post-processing
-            # but don't pass them to the model
-            model_batch = {k: v for k, v in batch.items() if k in self.model_input_names[:3]} # No positions for eval
-            # Store extra info for post-processing
-            model_batch["example_id"] = batch.get("example_id")
-            if "offset_mapping" in batch:
-                model_batch["offset_mapping"] = batch["offset_mapping"]
-            batch = model_batch
-
-        return batch
-
 # ───────────────────────────────────────────────────────────────── main ──
 
 def main():
     global val_raw, val_feats, tok
 
     set_seed(SEED)
-
-    # Set tokenizers parallelism to avoid the warning
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     # Model name to store on Hub
     model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v3")
@@ -282,8 +280,8 @@ def main():
     # LoRA with slightly more aggressive settings for smaller dataset
     lora = LoraConfig(
         task_type=TaskType.QUESTION_ANS,
-        r=32, lora_alpha=64, lora_dropout=0.1, # Increased for better learning with less data
-        target_modules=["query", "value", "key", "dense"], # More modules for better coverage
+        r=32, lora_alpha=64, lora_dropout=0.1,
+        target_modules=["query", "value", "key", "dense"],
     )
     model = get_peft_model(model, lora)
     model.print_trainable_parameters()
@@ -291,32 +289,29 @@ def main():
     # ── efficient preprocessing ─────────────────────────────────────────
    print("🔄 Starting efficient preprocessing...")
 
-    # Process training data (remove offset_mapping)
+    # Process training data (NO offset_mapping)
     train_feats = preprocess_dataset_streaming(train_raw, tok, "Training", is_training=True)
 
-    # Process validation data (keep offset_mapping for evaluation)
+    # Process validation data (WITH offset_mapping)
     val_feats = preprocess_dataset_streaming(val_raw, tok, "Validation", is_training=False)
 
     print(f"✅ Preprocessing completed!")
     print(f" Training features: {len(train_feats)}")
     print(f" Validation features: {len(val_feats)}")
+    print(f" Training columns: {train_feats.column_names}")
+    print(f" Validation columns: {val_feats.column_names}")
 
-    # ── training args with fixed eval/save step alignment ──────────────────
-    # Calculate proper steps that align
+    # ── training args ──────────────────
     batch_size = 16
     gradient_accumulation_steps = 2
     effective_batch_size = batch_size * gradient_accumulation_steps
 
-    # Calculate total training steps
     num_epochs = 6 if USE_SUBSET else 4
     steps_per_epoch = len(train_feats) // effective_batch_size
     total_steps = steps_per_epoch * num_epochs
 
-    # Set eval steps first
-    eval_steps = max(50, steps_per_epoch // 4) # Evaluate 4 times per epoch
-
-    # Set save steps as a multiple of eval steps
-    save_steps = eval_steps * 2 # Save every 2 evaluations
+    eval_steps = max(50, steps_per_epoch // 4)
+    save_steps = eval_steps * 2
 
     print(f"📊 Training configuration:")
     print(f" Steps per epoch: {steps_per_epoch}")
@@ -326,7 +321,7 @@ def main():
 
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
-        learning_rate=5e-5, # Slightly higher for smaller dataset
+        learning_rate=5e-5,
         num_train_epochs=num_epochs,
         per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=16,
@@ -346,25 +341,18 @@ def main():
         report_to="none",
         dataloader_num_workers=2,
         dataloader_pin_memory=True,
-        remove_unused_columns=False, # Keep example_id for evaluation
+        remove_unused_columns=True, # Let the trainer handle column removal
     )
 
-    # Use custom data collators
-    train_collator = QADataCollator(tok, training=True)
-    eval_collator = QADataCollator(tok, training=False)
-
     trainer = Trainer(
         model=model,
         args=args,
         train_dataset=train_feats,
         eval_dataset=val_feats,
         tokenizer=tok,
-        data_collator=train_collator, # Use custom collator
+        data_collator=default_data_collator,
         compute_metrics=compute_metrics,
     )
-
-    # Override the evaluation collator
-    trainer.data_collator = eval_collator
 
     print("🚀 Training…")
     try:
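
Why remove_unused_columns=True plus default_data_collator is enough here: the Trainer keeps only dataset columns that match the model's forward() signature, so the training features pass straight through, while the extra validation columns example_id and offset_mapping never reach the model. The snippet below is a small illustrative check of that mechanism; it mirrors, but is not, the Trainer's internal logic, and "roberta-base" is only a stand-in checkpoint since the script's actual base model is not shown in this diff.

import inspect
from transformers import AutoModelForQuestionAnswering

# Stand-in checkpoint for illustration only.
model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")
forward_params = set(inspect.signature(model.forward).parameters)

# Columns produced by the two preprocessing functions in this commit.
train_columns = ["input_ids", "attention_mask", "start_positions", "end_positions"]
val_columns = ["input_ids", "attention_mask", "example_id", "offset_mapping"]

print(sorted(c for c in train_columns if c in forward_params))
# -> ['attention_mask', 'end_positions', 'input_ids', 'start_positions']
print(sorted(c for c in val_columns if c not in forward_params))
# -> ['example_id', 'offset_mapping']: dropped by the Trainer before collation,
#    so default_data_collator and the model never see them.

The dataset objects themselves still keep example_id and offset_mapping, so span post-processing can read them from val_feats after prediction.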
 
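On the step arithmetic kept by this commit: eval_steps is derived from the epoch length so evaluation runs roughly four times per epoch, and save_steps stays an exact multiple of eval_steps, which Transformers requires when load_best_model_at_end is enabled (that flag sits outside the visible hunks, so it is assumed here). A worked example with a hypothetical feature count:

# Hypothetical count; the real len(train_feats) is not shown in this diff.
num_train_features = 20_000

batch_size = 16
gradient_accumulation_steps = 2
effective_batch_size = batch_size * gradient_accumulation_steps  # 32

steps_per_epoch = num_train_features // effective_batch_size     # 625
eval_steps = max(50, steps_per_epoch // 4)                       # 156, about 4 evals per epoch
save_steps = eval_steps * 2                                      # 312, a round multiple of eval_steps

assert save_steps % eval_steps == 0  # what load_best_model_at_end expects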