Tonic committed on
Commit aa1f3a9 · verified · 1 parent: 361d212

attempts to revert to previous training arguments

Files changed (2)
  1. model.py +39 -44
  2. trainer.py +17 -18
model.py CHANGED
@@ -128,50 +128,45 @@ class SmolLM3Model:
         logger.info(f"Config type: {type(self.config)}")
         logger.info(f"Config attributes: {[attr for attr in dir(self.config) if not attr.startswith('_')]}")
 
-        # Merge config with kwargs
-        training_args = {}
-
-        # Add arguments one by one with error checking
-        try:
-            training_args["output_dir"] = output_dir
-            training_args["per_device_train_batch_size"] = self.config.batch_size
-            training_args["per_device_eval_batch_size"] = self.config.batch_size
-            training_args["gradient_accumulation_steps"] = self.config.gradient_accumulation_steps
-            training_args["learning_rate"] = self.config.learning_rate
-            training_args["weight_decay"] = self.config.weight_decay
-            training_args["warmup_steps"] = self.config.warmup_steps
-            training_args["max_steps"] = self.config.max_iters
-            training_args["save_steps"] = self.config.save_steps
-            training_args["eval_steps"] = self.config.eval_steps
-            training_args["logging_steps"] = self.config.logging_steps
-            training_args["save_total_limit"] = self.config.save_total_limit
-            training_args["eval_strategy"] = self.config.eval_strategy
-            training_args["metric_for_best_model"] = self.config.metric_for_best_model
-            training_args["greater_is_better"] = self.config.greater_is_better
-            training_args["load_best_model_at_end"] = self.config.load_best_model_at_end
-            training_args["fp16"] = self.config.fp16
-            training_args["bf16"] = self.config.bf16
-            training_args["ddp_backend"] = self.config.ddp_backend if torch.cuda.device_count() > 1 else None
-            training_args["ddp_find_unused_parameters"] = self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False
-            training_args["report_to"] = None
-            training_args["remove_unused_columns"] = False
-            training_args["dataloader_pin_memory"] = False
-            training_args["group_by_length"] = True
-            training_args["length_column_name"] = "length"
-            training_args["ignore_data_skip"] = False
-            training_args["seed"] = 42
-            training_args["data_seed"] = 42
-            training_args["dataloader_num_workers"] = getattr(self.config, 'dataloader_num_workers', 4)
-            training_args["max_grad_norm"] = getattr(self.config, 'max_grad_norm', 1.0)
-            training_args["optim"] = self.config.optimizer
-            training_args["lr_scheduler_type"] = self.config.scheduler
-            training_args["warmup_ratio"] = 0.1
-            training_args["save_strategy"] = "steps"
-            training_args["logging_strategy"] = "steps"
-            training_args["prediction_loss_only"] = True
-        except Exception as e:
-            logger.error(f"Error creating training arguments: {e}")
-            raise
+        # Merge config with kwargs - using the working approach from the functioning commit
+        training_args = {
+            "output_dir": output_dir,
+            "per_device_train_batch_size": self.config.batch_size,
+            "per_device_eval_batch_size": self.config.batch_size,
+            "gradient_accumulation_steps": self.config.gradient_accumulation_steps,
+            "learning_rate": self.config.learning_rate,
+            "weight_decay": self.config.weight_decay,
+            "warmup_steps": self.config.warmup_steps,
+            "max_steps": self.config.max_iters,
+            "save_steps": self.config.save_steps,
+            "eval_steps": self.config.eval_steps,
+            "logging_steps": self.config.logging_steps,
+            "save_total_limit": self.config.save_total_limit,
+            "eval_strategy": self.config.eval_strategy,
+            "metric_for_best_model": self.config.metric_for_best_model,
+            "greater_is_better": self.config.greater_is_better,
+            "load_best_model_at_end": self.config.load_best_model_at_end,
+            "fp16": self.config.fp16,
+            "bf16": self.config.bf16,
+            "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
+            "ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
+            "report_to": None,
+            "remove_unused_columns": False,
+            "dataloader_pin_memory": False,
+            "group_by_length": True,
+            "length_column_name": "length",
+            "ignore_data_skip": False,
+            "seed": 42,
+            "data_seed": 42,
+            "dataloader_num_workers": getattr(self.config, 'dataloader_num_workers', 4),
+            "max_grad_norm": getattr(self.config, 'max_grad_norm', 1.0),
+            "optim": self.config.optimizer,
+            "lr_scheduler_type": self.config.scheduler,
+            "warmup_ratio": 0.1,
+            "save_strategy": "steps",
+            "logging_strategy": "steps",
+            "prediction_loss_only": True,
+        }
 
         # Override with kwargs
         training_args.update(kwargs)
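
Note: the revert swaps the incremental try/except construction for a single dict literal; the keys and values are identical, only the failure mode changes (a missing config attribute now raises at dict construction instead of inside the per-key error handler). How the merged dict is consumed is outside this hunk; presumably it is unpacked into transformers.TrainingArguments. A minimal sketch of that step, with build_training_args as a hypothetical helper name:

from transformers import TrainingArguments

def build_training_args(training_args: dict, **kwargs) -> TrainingArguments:
    # Caller-supplied kwargs win over config-derived defaults,
    # mirroring training_args.update(kwargs) in the diff above
    training_args.update(kwargs)
    return TrainingArguments(**training_args)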
trainer.py CHANGED
@@ -101,26 +101,25 @@ class SmolLM3Trainer:
         eval_loss = metrics.get('eval_loss', 'N/A')
         print(f"📊 Evaluation at step {step}: eval_loss={eval_loss}")
 
-        # Temporarily disable callbacks to debug the issue
+        # Add monitoring callbacks
         callbacks = []
-        logger.info("Callbacks disabled for debugging")
 
-        # # Add simple console callback
-        # callbacks.append(SimpleConsoleCallback())
-        # logger.info("Added simple console monitoring callback")
-        #
-        # # Try to add Trackio callback if available
-        # if self.monitor and self.monitor.enable_tracking:
-        #     try:
-        #         trackio_callback = self.monitor.create_monitoring_callback()
-        #         if trackio_callback:
-        #             callbacks.append(trackio_callback)
-        #             logger.info("Added Trackio monitoring callback")
-        #         else:
-        #             logger.warning("Failed to create Trackio callback")
-        #     except Exception as e:
-        #         logger.error(f"Error creating Trackio callback: {e}")
-        #         logger.info("Continuing with console monitoring only")
+        # Add simple console callback
+        callbacks.append(SimpleConsoleCallback())
+        logger.info("Added simple console monitoring callback")
+
+        # Try to add Trackio callback if available
+        if self.monitor and self.monitor.enable_tracking:
+            try:
+                trackio_callback = self.monitor.create_monitoring_callback()
+                if trackio_callback:
+                    callbacks.append(trackio_callback)
+                    logger.info("Added Trackio monitoring callback")
+                else:
+                    logger.warning("Failed to create Trackio callback")
+            except Exception as e:
+                logger.error(f"Error creating Trackio callback: {e}")
+                logger.info("Continuing with console monitoring only")
 
         # Try standard Trainer first (more stable with callbacks)
         logger.info("Creating Trainer with training arguments...")