Update train.py
train.py CHANGED
@@ -3,6 +3,7 @@
 """
 CUAD fine-tune with LoRA - Efficient batch processing version.
 Fixes bottlenecks and uses proper batching instead of chunking.
+GUARANTEED FIX for offset_mapping error.
 """
 
 import os, json, random, gc, time
@@ -21,6 +22,9 @@ from huggingface_hub import login
 
 disable_caching()
 
+# Set tokenizers parallelism to avoid warnings
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 # ─────────────────────────────────────────────────────────────── config ──
 
 MAX_LEN = 384
@@ -108,15 +112,14 @@ def compute_metrics(eval_pred):
 
 # ───────────────────────────────────────────────────────────── preprocessing ──
 
-def preprocess_batch_efficient(examples, tokenizer, is_training=True):
+def preprocess_training_batch(examples, tokenizer):
     """
-
-    This is much faster than processing examples individually.
+    Training preprocessing - NO offset_mapping included
     """
     questions = examples["question"]
     contexts = examples["context"]
 
-    # Batch tokenization
+    # Batch tokenization
     tokenized_examples = tokenizer(
         questions,
         contexts,
@@ -124,19 +127,19 @@ def preprocess_batch_efficient(examples, tokenizer, is_training=True):
         max_length=MAX_LEN,
         stride=DOC_STRIDE,
         return_overflowing_tokens=True,
-        return_offsets_mapping=True,
+        return_offsets_mapping=True,  # We need this temporarily for position calculation
         padding="max_length",
     )
 
     # Map back to original examples
     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    offset_mapping = tokenized_examples.pop("offset_mapping")  # Remove it immediately after use
 
     # Initialize output
     start_positions = []
     end_positions = []
 
-    for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
-        input_ids = tokenized_examples["input_ids"][i]
+    for i, offsets in enumerate(offset_mapping):
         cls_index = 0  # CLS token position
 
         # Get the original example for this tokenized chunk
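The answer-span mapping that follows this hunk is unchanged by the commit, so the diff skips it. For orientation, here is a minimal, self-contained sketch of how a SQuAD/CUAD-style character span is usually turned into token start/end positions from an offset mapping; the helper name char_span_to_token_positions and its exact guards are illustrative, not the code in train.py.

# Illustrative sketch only - not the exact logic in train.py.
def char_span_to_token_positions(offsets, sequence_ids, answer_start, answer_text, cls_index=0):
    """Map a character-level answer span onto token indices for one tokenized chunk."""
    answer_end = answer_start + len(answer_text)

    # The question is sequence 0 and the context is sequence 1 in a question/context pair.
    ctx_start = sequence_ids.index(1)
    ctx_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    # Answers that are not fully inside this chunk are labelled with the CLS token.
    if not (offsets[ctx_start][0] <= answer_start and offsets[ctx_end][1] >= answer_end):
        return cls_index, cls_index

    # Walk inward to the first and last tokens that cover the answer characters.
    start_tok = ctx_start
    while start_tok <= ctx_end and offsets[start_tok][0] <= answer_start:
        start_tok += 1
    end_tok = ctx_end
    while end_tok >= ctx_start and offsets[end_tok][1] >= answer_end:
        end_tok -= 1
    return start_tok - 1, end_tok + 1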
@@ -176,73 +179,68 @@ def preprocess_batch_efficient(examples, tokenizer, is_training=True):
     tokenized_examples["start_positions"] = start_positions
     tokenized_examples["end_positions"] = end_positions
 
+    # NO offset_mapping or example_id for training
+    return tokenized_examples
+
+def preprocess_validation_batch(examples, tokenizer):
+    """
+    Validation preprocessing - INCLUDES offset_mapping and example_id for post-processing
+    """
+    questions = examples["question"]
+    contexts = examples["context"]
+
+    # Batch tokenization
+    tokenized_examples = tokenizer(
+        questions,
+        contexts,
+        truncation="only_second",
+        max_length=MAX_LEN,
+        stride=DOC_STRIDE,
+        return_overflowing_tokens=True,
+        return_offsets_mapping=True,
+        padding="max_length",
+    )
+
+    # Map back to original examples
+    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
     # Add example IDs for evaluation
     tokenized_examples["example_id"] = [
         examples["id"][sample_mapping[i]] for i in range(len(tokenized_examples["input_ids"]))
     ]
 
-    #
-    # Only keep it for evaluation data
-    if is_training:
-        tokenized_examples.pop("offset_mapping", None)
-
+    # Keep offset_mapping for post-processing
     return tokenized_examples
 
 def preprocess_dataset_streaming(dataset, tokenizer, desc="Processing", is_training=True):
     """
     Process dataset in batches using HuggingFace's map function with batching.
-    This is much more memory efficient and faster than manual chunking.
     """
     print(f"🔄 {desc} dataset with batch processing...")
 
+    if is_training:
+        preprocess_fn = preprocess_training_batch
+    else:
+        preprocess_fn = preprocess_validation_batch
+
     processed = dataset.map(
-        lambda examples: preprocess_batch_efficient(examples, tokenizer, is_training=is_training),
+        lambda examples: preprocess_fn(examples, tokenizer),
         batched=True,
         batch_size=BATCH_SIZE,
        remove_columns=dataset.column_names,
         desc=desc,
-        num_proc=1,
+        num_proc=1,
     )
 
     print(f"✅ {desc} completed: {len(processed)} features")
     return processed
 
-# Custom data collator that ensures no unwanted keys are passed to the model
-class QADataCollator:
-    def __init__(self, tokenizer, training=True):
-        self.tokenizer = tokenizer
-        self.training = training
-        # Keys that the model expects
-        self.model_input_names = ["input_ids", "attention_mask", "token_type_ids", "start_positions", "end_positions"]
-
-    def __call__(self, features):
-        # Use the default collator first
-        batch = default_data_collator(features)
-
-        # For training, only keep model input keys
-        if self.training:
-            batch = {k: v for k, v in batch.items() if k in self.model_input_names}
-        else:
-            # For evaluation, keep example_id and offset_mapping for post-processing
-            # but don't pass them to the model
-            model_batch = {k: v for k, v in batch.items() if k in self.model_input_names[:3]}  # No positions for eval
-            # Store extra info for post-processing
-            model_batch["example_id"] = batch.get("example_id")
-            if "offset_mapping" in batch:
-                model_batch["offset_mapping"] = batch["offset_mapping"]
-            batch = model_batch
-
-        return batch
-
 # ───────────────────────────────────────────────────────────────── main ──
 
 def main():
     global val_raw, val_feats, tok
 
     set_seed(SEED)
-
-    # Set tokenizers parallelism to avoid the warning
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     # Model name to store on Hub
     model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v3")
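Validation features keep example_id and offset_mapping because the predicted token indices have to be mapped back to text in the original contract before metrics can be computed. The evaluation code itself is outside this diff; the snippet below is a minimal sketch of that direction of the mapping (greedy argmax, single feature), with the helper name and the length guard chosen for illustration rather than taken from train.py.

import numpy as np

# Illustrative sketch only: decode one validation feature's logits back to answer text.
# In the real pipeline, features are first grouped by example_id so every chunk of the
# same contract/question pair competes for the best span; that grouping is omitted here.
def extract_answer(start_logits, end_logits, offsets, context, max_answer_chars=512):
    start_idx = int(np.argmax(start_logits))
    end_idx = int(np.argmax(end_logits))
    if end_idx < start_idx:                       # start after end -> no answer
        return ""
    start_char = offsets[start_idx][0]
    end_char = offsets[end_idx][1]
    if end_char - start_char > max_answer_chars:  # implausibly long span -> no answer
        return ""
    return context[start_char:end_char]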
@@ -282,8 +280,8 @@ def main():
     # LoRA with slightly more aggressive settings for smaller dataset
     lora = LoraConfig(
        task_type=TaskType.QUESTION_ANS,
-        r=32, lora_alpha=64, lora_dropout=0.1,
-        target_modules=["query", "value", "key", "dense"],
+        r=32, lora_alpha=64, lora_dropout=0.1,
+        target_modules=["query", "value", "key", "dense"],
    )
     model = get_peft_model(model, lora)
     model.print_trainable_parameters()
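The target_modules list assumes RoBERTa's internal layer names (query, key, value for self-attention, dense for the attention/FFN projections). If the base checkpoint ever changes, a quick check like the following confirms the names still exist before PEFT tries to match them; this is illustrative and assumes a roberta-base QA model, it is not part of train.py.

from transformers import AutoModelForQuestionAnswering

# Illustrative: list which LoRA target names actually occur in the model's modules.
model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")
targets = {"query", "key", "value", "dense"}
found = sorted({name.split(".")[-1] for name, _ in model.named_modules()
                if name.split(".")[-1] in targets})
print(found)  # expected: ['dense', 'key', 'query', 'value']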
@@ -291,32 +289,29 @@ def main():
     # ── efficient preprocessing ─────────────────────────────────────────
     print("🚀 Starting efficient preprocessing...")
 
-    # Process training data (
+    # Process training data (NO offset_mapping)
     train_feats = preprocess_dataset_streaming(train_raw, tok, "Training", is_training=True)
 
-    # Process validation data (
+    # Process validation data (WITH offset_mapping)
     val_feats = preprocess_dataset_streaming(val_raw, tok, "Validation", is_training=False)
 
     print(f"✅ Preprocessing completed!")
     print(f"   Training features: {len(train_feats)}")
     print(f"   Validation features: {len(val_feats)}")
+    print(f"   Training columns: {train_feats.column_names}")
+    print(f"   Validation columns: {val_feats.column_names}")
 
-    # ── training args
-    # Calculate proper steps that align
+    # ── training args ──────────────────
     batch_size = 16
     gradient_accumulation_steps = 2
     effective_batch_size = batch_size * gradient_accumulation_steps
 
-    # Calculate total training steps
     num_epochs = 6 if USE_SUBSET else 4
     steps_per_epoch = len(train_feats) // effective_batch_size
     total_steps = steps_per_epoch * num_epochs
 
-
-
-
-    # Set save steps as a multiple of eval steps
-    save_steps = eval_steps * 2  # Save every 2 evaluations
+    eval_steps = max(50, steps_per_epoch // 4)
+    save_steps = eval_steps * 2
 
     print(f"📊 Training configuration:")
     print(f"   Steps per epoch: {steps_per_epoch}")
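As a worked example of the schedule arithmetic above, with a hypothetical feature count (the real len(train_feats) depends on MAX_LEN, DOC_STRIDE and the subset setting):

# Hypothetical numbers, only to show how eval/save steps fall out of the formulas above.
train_features = 20_000                                   # assumed len(train_feats)
effective_batch_size = 16 * 2                             # batch_size * gradient accumulation = 32
steps_per_epoch = train_features // effective_batch_size  # 625
eval_steps = max(50, steps_per_epoch // 4)                # 156
save_steps = eval_steps * 2                               # 312
total_steps = steps_per_epoch * 4                         # 2500 over 4 epochs
print(steps_per_epoch, eval_steps, save_steps, total_steps)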
@@ -326,7 +321,7 @@ def main():
 
     args = TrainingArguments(
         output_dir="./cuad_lora_out",
-        learning_rate=5e-5,
+        learning_rate=5e-5,
         num_train_epochs=num_epochs,
         per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=16,
@@ -346,25 +341,18 @@ def main():
         report_to="none",
         dataloader_num_workers=2,
         dataloader_pin_memory=True,
-        remove_unused_columns=False,
+        remove_unused_columns=True,  # Let the trainer handle column removal
     )
 
-    # Use custom data collators
-    train_collator = QADataCollator(tok, training=True)
-    eval_collator = QADataCollator(tok, training=False)
-
     trainer = Trainer(
         model=model,
         args=args,
         train_dataset=train_feats,
         eval_dataset=val_feats,
         tokenizer=tok,
-        data_collator=train_collator,
+        data_collator=default_data_collator,
         compute_metrics=compute_metrics,
     )
-
-    # Override the evaluation collator
-    trainer.data_collator = eval_collator
 
     print("🚀 Training…")
     try:
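The custom QADataCollator is no longer needed because remove_unused_columns=True makes the Trainer drop every dataset column that the model's forward signature does not accept, so example_id and offset_mapping never reach the model and default_data_collator can simply stack what remains. Presumably the evaluation code reads those two columns straight from the global val_feats rather than from the batches. A small illustration of which columns survive, assuming a roberta-base QA model (not part of train.py):

import inspect
from transformers import AutoModelForQuestionAnswering

# Illustrative: remove_unused_columns=True keeps only columns named in model.forward's signature.
model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")
forward_params = set(inspect.signature(model.forward).parameters)

dataset_columns = ["input_ids", "attention_mask", "start_positions",
                   "end_positions", "example_id", "offset_mapping"]
print([c for c in dataset_columns if c in forward_params])
# -> ['input_ids', 'attention_mask', 'start_positions', 'end_positions']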