Tonic committed on
Commit
cdc0df1
·
verified ·
1 Parent(s): d4bee15

enable external logging boolean correctly

Browse files
Files changed (2) hide show
  1. model.py +2 -12
  2. trainer.py +12 -13
model.py CHANGED
@@ -151,7 +151,7 @@ class SmolLM3Model:
151
  # Only enable DDP if multiple GPUs are available
152
  "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
153
  "ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
154
- "report_to": "none", # Disable external logging
155
  "remove_unused_columns": False,
156
  "dataloader_pin_memory": False,
157
  "group_by_length": True,
@@ -172,17 +172,7 @@ class SmolLM3Model:
172
  # Override with kwargs
173
  training_args.update(kwargs)
174
 
175
- # Debug: Check for any boolean values that might be causing issues
176
- for key, value in training_args.items():
177
- if isinstance(value, bool):
178
- logger.info(f"Boolean argument: {key} = {value}")
179
-
180
- try:
181
- return TrainingArguments(**training_args)
182
- except Exception as e:
183
- logger.error(f"Failed to create TrainingArguments: {e}")
184
- logger.error(f"Training arguments: {training_args}")
185
- raise
186
 
187
  def save_pretrained(self, path: str):
188
  """Save model and tokenizer"""
 
151
  # Only enable DDP if multiple GPUs are available
152
  "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
153
  "ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
154
+ "report_to": None, # Enable external logging (default)
155
  "remove_unused_columns": False,
156
  "dataloader_pin_memory": False,
157
  "group_by_length": True,
 
172
  # Override with kwargs
173
  training_args.update(kwargs)
174
 
175
+ return TrainingArguments(**training_args)
 
 
 
 
 
 
 
 
 
 
176
 
177
  def save_pretrained(self, path: str):
178
  """Save model and tokenizer"""
trainer.py CHANGED
@@ -98,19 +98,18 @@ class SmolLM3Trainer:
98
  callbacks.append(SimpleConsoleCallback())
99
  logger.info("Added simple console monitoring callback")
100
 
101
- # Try to add Trackio callback if available (temporarily disabled for debugging)
102
- logger.info("Skipping Trackio callback to debug training issue")
103
- # if self.monitor and self.monitor.enable_tracking:
104
- # try:
105
- # trackio_callback = self.monitor.create_monitoring_callback()
106
- # if trackio_callback:
107
- # callbacks.append(trackio_callback)
108
- # logger.info("Added Trackio monitoring callback")
109
- # else:
110
- # logger.warning("Failed to create Trackio callback")
111
- # except Exception as e:
112
- # logger.error(f"Error creating Trackio callback: {e}")
113
- # logger.info("Continuing with console monitoring only")
114
 
115
  # Try standard Trainer first (more stable with callbacks)
116
  try:
 
98
  callbacks.append(SimpleConsoleCallback())
99
  logger.info("Added simple console monitoring callback")
100
 
101
+ # Try to add Trackio callback if available
102
+ if self.monitor and self.monitor.enable_tracking:
103
+ try:
104
+ trackio_callback = self.monitor.create_monitoring_callback()
105
+ if trackio_callback:
106
+ callbacks.append(trackio_callback)
107
+ logger.info("Added Trackio monitoring callback")
108
+ else:
109
+ logger.warning("Failed to create Trackio callback")
110
+ except Exception as e:
111
+ logger.error(f"Error creating Trackio callback: {e}")
112
+ logger.info("Continuing with console monitoring only")
 
113
 
114
  # Try standard Trainer first (more stable with callbacks)
115
  try: