Spaces:
Running
Running
attempts to revert to previous training arguments
Browse files
- model.py +39 -44
- trainer.py +17 -18
model.py
CHANGED
@@ -128,50 +128,45 @@ class SmolLM3Model:
|
|
128 |
logger.info(f"Config type: {type(self.config)}")
|
129 |
logger.info(f"Config attributes: {[attr for attr in dir(self.config) if not attr.startswith('_')]}")
|
130 |
|
131 |
-
# Merge config with kwargs
|
132 |
-
training_args = {
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
training_args["logging_strategy"] = "steps"
|
171 |
-
training_args["prediction_loss_only"] = True
|
172 |
-
except Exception as e:
|
173 |
-
logger.error(f"Error creating training arguments: {e}")
|
174 |
-
raise
|
175 |
|
176 |
# Override with kwargs
|
177 |
training_args.update(kwargs)
|
|
|
128 |
logger.info(f"Config type: {type(self.config)}")
|
129 |
logger.info(f"Config attributes: {[attr for attr in dir(self.config) if not attr.startswith('_')]}")
|
130 |
|
131 |
+
# Merge config with kwargs - using the working approach from the functioning commit
|
132 |
+
training_args = {
|
133 |
+
"output_dir": output_dir,
|
134 |
+
"per_device_train_batch_size": self.config.batch_size,
|
135 |
+
"per_device_eval_batch_size": self.config.batch_size,
|
136 |
+
"gradient_accumulation_steps": self.config.gradient_accumulation_steps,
|
137 |
+
"learning_rate": self.config.learning_rate,
|
138 |
+
"weight_decay": self.config.weight_decay,
|
139 |
+
"warmup_steps": self.config.warmup_steps,
|
140 |
+
"max_steps": self.config.max_iters,
|
141 |
+
"save_steps": self.config.save_steps,
|
142 |
+
"eval_steps": self.config.eval_steps,
|
143 |
+
"logging_steps": self.config.logging_steps,
|
144 |
+
"save_total_limit": self.config.save_total_limit,
|
145 |
+
"eval_strategy": self.config.eval_strategy,
|
146 |
+
"metric_for_best_model": self.config.metric_for_best_model,
|
147 |
+
"greater_is_better": self.config.greater_is_better,
|
148 |
+
"load_best_model_at_end": self.config.load_best_model_at_end,
|
149 |
+
"fp16": self.config.fp16,
|
150 |
+
"bf16": self.config.bf16,
|
151 |
+
"ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
|
152 |
+
"ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
|
153 |
+
"report_to": None,
|
154 |
+
"remove_unused_columns": False,
|
155 |
+
"dataloader_pin_memory": False,
|
156 |
+
"group_by_length": True,
|
157 |
+
"length_column_name": "length",
|
158 |
+
"ignore_data_skip": False,
|
159 |
+
"seed": 42,
|
160 |
+
"data_seed": 42,
|
161 |
+
"dataloader_num_workers": getattr(self.config, 'dataloader_num_workers', 4),
|
162 |
+
"max_grad_norm": getattr(self.config, 'max_grad_norm', 1.0),
|
163 |
+
"optim": self.config.optimizer,
|
164 |
+
"lr_scheduler_type": self.config.scheduler,
|
165 |
+
"warmup_ratio": 0.1,
|
166 |
+
"save_strategy": "steps",
|
167 |
+
"logging_strategy": "steps",
|
168 |
+
"prediction_loss_only": True,
|
169 |
+
}
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
# Override with kwargs
|
172 |
training_args.update(kwargs)
|
trainer.py
CHANGED
@@ -101,26 +101,25 @@ class SmolLM3Trainer:
|
|
101 |
eval_loss = metrics.get('eval_loss', 'N/A')
|
102 |
print(f"📊 Evaluation at step {step}: eval_loss={eval_loss}")
|
103 |
|
104 |
-
#
|
105 |
callbacks = []
|
106 |
-
logger.info("Callbacks disabled for debugging")
|
107 |
|
108 |
-
#
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
#
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
|
125 |
# Try standard Trainer first (more stable with callbacks)
|
126 |
logger.info("Creating Trainer with training arguments...")
|
|
|
101 |
eval_loss = metrics.get('eval_loss', 'N/A')
|
102 |
print(f"📊 Evaluation at step {step}: eval_loss={eval_loss}")
|
103 |
|
104 |
+
# Add monitoring callbacks
|
105 |
callbacks = []
|
|
|
106 |
|
107 |
+
# Add simple console callback
|
108 |
+
callbacks.append(SimpleConsoleCallback())
|
109 |
+
logger.info("Added simple console monitoring callback")
|
110 |
+
|
111 |
+
# Try to add Trackio callback if available
|
112 |
+
if self.monitor and self.monitor.enable_tracking:
|
113 |
+
try:
|
114 |
+
trackio_callback = self.monitor.create_monitoring_callback()
|
115 |
+
if trackio_callback:
|
116 |
+
callbacks.append(trackio_callback)
|
117 |
+
logger.info("Added Trackio monitoring callback")
|
118 |
+
else:
|
119 |
+
logger.warning("Failed to create Trackio callback")
|
120 |
+
except Exception as e:
|
121 |
+
logger.error(f"Error creating Trackio callback: {e}")
|
122 |
+
logger.info("Continuing with console monitoring only")
|
123 |
|
124 |
# Try standard Trainer first (more stable with callbacks)
|
125 |
logger.info("Creating Trainer with training arguments...")
|