Spaces:
Running
Running
Try to resolve the issue with SFTTrainer or Trackio
Browse files- trainer.py +25 -21
trainer.py
CHANGED
@@ -98,40 +98,44 @@ class SmolLM3Trainer:
|
|
98 |
callbacks.append(SimpleConsoleCallback())
|
99 |
logger.info("Added simple console monitoring callback")
|
100 |
|
101 |
-
# Try to add Trackio callback if available
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
trainer =
|
117 |
model=self.model.model,
|
|
|
|
|
118 |
train_dataset=train_dataset,
|
119 |
eval_dataset=eval_dataset,
|
120 |
-
args=training_args,
|
121 |
data_collator=data_collator,
|
122 |
callbacks=callbacks,
|
123 |
)
|
124 |
-
|
125 |
-
|
126 |
-
|
|
|
|
|
127 |
model=self.model.model,
|
128 |
-
tokenizer=self.model.tokenizer,
|
129 |
-
args=training_args,
|
130 |
train_dataset=train_dataset,
|
131 |
eval_dataset=eval_dataset,
|
|
|
132 |
data_collator=data_collator,
|
133 |
callbacks=callbacks,
|
134 |
)
|
|
|
135 |
|
136 |
return trainer
|
137 |
|
|
|
98 |
callbacks.append(SimpleConsoleCallback())
|
99 |
logger.info("Added simple console monitoring callback")
|
100 |
|
101 |
+
# Try to add Trackio callback if available (temporarily disabled for debugging)
|
102 |
+
logger.info("Skipping Trackio callback to debug training issue")
|
103 |
+
# if self.monitor and self.monitor.enable_tracking:
|
104 |
+
# try:
|
105 |
+
# trackio_callback = self.monitor.create_monitoring_callback()
|
106 |
+
# if trackio_callback:
|
107 |
+
# callbacks.append(trackio_callback)
|
108 |
+
# logger.info("Added Trackio monitoring callback")
|
109 |
+
# else:
|
110 |
+
# logger.warning("Failed to create Trackio callback")
|
111 |
+
# except Exception as e:
|
112 |
+
# logger.error(f"Error creating Trackio callback: {e}")
|
113 |
+
# logger.info("Continuing with console monitoring only")
|
114 |
|
115 |
+
# Try standard Trainer first (more stable with callbacks)
|
116 |
+
try:
|
117 |
+
trainer = Trainer(
|
118 |
model=self.model.model,
|
119 |
+
tokenizer=self.model.tokenizer,
|
120 |
+
args=training_args,
|
121 |
train_dataset=train_dataset,
|
122 |
eval_dataset=eval_dataset,
|
|
|
123 |
data_collator=data_collator,
|
124 |
callbacks=callbacks,
|
125 |
)
|
126 |
+
logger.info("Using standard Hugging Face Trainer")
|
127 |
+
except Exception as e:
|
128 |
+
logger.warning(f"Standard Trainer failed: {e}")
|
129 |
+
# Fallback to SFTTrainer
|
130 |
+
trainer = SFTTrainer(
|
131 |
model=self.model.model,
|
|
|
|
|
132 |
train_dataset=train_dataset,
|
133 |
eval_dataset=eval_dataset,
|
134 |
+
args=training_args,
|
135 |
data_collator=data_collator,
|
136 |
callbacks=callbacks,
|
137 |
)
|
138 |
+
logger.info("Using SFTTrainer")
|
139 |
|
140 |
return trainer
|
141 |
|