Tonic committed on
Commit b13876b · verified · 1 Parent(s): 084000d

attempts to identify trainer bug

Files changed (1)
  1. model.py +43 -39
model.py CHANGED
@@ -129,45 +129,49 @@ class SmolLM3Model:
         logger.info(f"Config attributes: {[attr for attr in dir(self.config) if not attr.startswith('_')]}")
 
         # Merge config with kwargs
-        training_args = {
-            "output_dir": output_dir,
-            "per_device_train_batch_size": self.config.batch_size,
-            "per_device_eval_batch_size": self.config.batch_size,
-            "gradient_accumulation_steps": self.config.gradient_accumulation_steps,
-            "learning_rate": self.config.learning_rate,
-            "weight_decay": self.config.weight_decay,
-            "warmup_steps": self.config.warmup_steps,
-            "max_steps": self.config.max_iters,
-            "save_steps": self.config.save_steps,
-            "eval_steps": self.config.eval_steps,
-            "logging_steps": self.config.logging_steps,
-            "save_total_limit": self.config.save_total_limit,
-            "eval_strategy": self.config.eval_strategy,
-            "metric_for_best_model": self.config.metric_for_best_model,
-            "greater_is_better": self.config.greater_is_better,
-            "load_best_model_at_end": self.config.load_best_model_at_end,
-            "fp16": self.config.fp16,
-            "bf16": self.config.bf16,
-            # Only enable DDP if multiple GPUs are available
-            "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
-            "ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
-            "report_to": None,  # Enable external logging (default)
-            "remove_unused_columns": False,
-            "dataloader_pin_memory": False,
-            "group_by_length": True,
-            "length_column_name": "length",
-            "ignore_data_skip": False,
-            "seed": 42,
-            "data_seed": 42,
-            "dataloader_num_workers": getattr(self.config, 'dataloader_num_workers', 4),
-            "max_grad_norm": getattr(self.config, 'max_grad_norm', 1.0),
-            "optim": self.config.optimizer,
-            "lr_scheduler_type": self.config.scheduler,
-            "warmup_ratio": 0.1,
-            "save_strategy": "steps",
-            "logging_strategy": "steps",
-            "prediction_loss_only": True,
-        }
+        training_args = {}
+
+        # Add arguments one by one with error checking
+        try:
+            training_args["output_dir"] = output_dir
+            training_args["per_device_train_batch_size"] = self.config.batch_size
+            training_args["per_device_eval_batch_size"] = self.config.batch_size
+            training_args["gradient_accumulation_steps"] = self.config.gradient_accumulation_steps
+            training_args["learning_rate"] = self.config.learning_rate
+            training_args["weight_decay"] = self.config.weight_decay
+            training_args["warmup_steps"] = self.config.warmup_steps
+            training_args["max_steps"] = self.config.max_iters
+            training_args["save_steps"] = self.config.save_steps
+            training_args["eval_steps"] = self.config.eval_steps
+            training_args["logging_steps"] = self.config.logging_steps
+            training_args["save_total_limit"] = self.config.save_total_limit
+            training_args["eval_strategy"] = self.config.eval_strategy
+            training_args["metric_for_best_model"] = self.config.metric_for_best_model
+            training_args["greater_is_better"] = self.config.greater_is_better
+            training_args["load_best_model_at_end"] = self.config.load_best_model_at_end
+            training_args["fp16"] = self.config.fp16
+            training_args["bf16"] = self.config.bf16
+            training_args["ddp_backend"] = self.config.ddp_backend if torch.cuda.device_count() > 1 else None
+            training_args["ddp_find_unused_parameters"] = self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False
+            training_args["report_to"] = None
+            training_args["remove_unused_columns"] = False
+            training_args["dataloader_pin_memory"] = False
+            training_args["group_by_length"] = True
+            training_args["length_column_name"] = "length"
+            training_args["ignore_data_skip"] = False
+            training_args["seed"] = 42
+            training_args["data_seed"] = 42
+            training_args["dataloader_num_workers"] = getattr(self.config, 'dataloader_num_workers', 4)
+            training_args["max_grad_norm"] = getattr(self.config, 'max_grad_norm', 1.0)
+            training_args["optim"] = self.config.optimizer
+            training_args["lr_scheduler_type"] = self.config.scheduler
+            training_args["warmup_ratio"] = 0.1
+            training_args["save_strategy"] = "steps"
+            training_args["logging_strategy"] = "steps"
+            training_args["prediction_loss_only"] = True
+        except Exception as e:
+            logger.error(f"Error creating training arguments: {e}")
+            raise
 
         # Override with kwargs
         training_args.update(kwargs)
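
If the failure turns out to come from passing these values into transformers' TrainingArguments rather than from building the dict itself, a small probe loop can narrow it down to a single keyword. The sketch below is not part of the commit: it assumes the standard Hugging Face TrainingArguments class, the helper name is hypothetical, and `training_args` is the dict assembled above.

# Sketch only (not from the commit): re-instantiate TrainingArguments with one
# extra keyword per iteration so the first argument it rejects is reported by name.
import logging

from transformers import TrainingArguments

logger = logging.getLogger(__name__)

def find_bad_training_arg(training_args: dict) -> TrainingArguments:
    # output_dir is the only required argument, so start from it alone.
    accepted = {"output_dir": training_args["output_dir"]}
    for key, value in training_args.items():
        if key == "output_dir":
            continue
        try:
            # Probe with the accepted arguments plus one new candidate.
            TrainingArguments(**accepted, **{key: value})
        except (TypeError, ValueError) as exc:
            # TypeError -> unknown keyword (e.g. renamed between transformers versions);
            # ValueError -> the value failed TrainingArguments' own validation.
            logger.error(f"TrainingArguments rejected {key}={value!r}: {exc}")
            raise
        accepted[key] = value
    return TrainingArguments(**accepted)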