Spaces:
Running
Running
File size: 11,603 Bytes
5fe83da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 |
"""
Trackio Monitoring Integration for SmolLM3 Fine-tuning
Provides comprehensive experiment tracking and monitoring capabilities
"""
import os
import json
import logging
from typing import Dict, Any, Optional, List
from datetime import datetime
import torch
from pathlib import Path
try:
import trackio
from trackio import TrackioClient
TRACKIO_AVAILABLE = True
except ImportError:
TRACKIO_AVAILABLE = False
print("Warning: Trackio not available. Install with: pip install trackio")
logger = logging.getLogger(__name__)
class SmolLM3Monitor:
    """Monitoring and tracking for SmolLM3 fine-tuning experiments.

    Thin wrapper around a Trackio experiment: logs configuration, training
    metrics, model checkpoints, evaluation results and system stats, and keeps
    a local JSON copy of everything it sends. All logging methods are no-ops
    when tracking is disabled (or trackio is not installed), and they catch
    and log Trackio errors so monitoring never crashes a training run.
    """

    def __init__(
        self,
        experiment_name: str,
        trackio_url: Optional[str] = None,
        trackio_token: Optional[str] = None,
        enable_tracking: bool = True,
        log_artifacts: bool = True,
        log_metrics: bool = True,
        log_config: bool = True
    ):
        """Create a monitor and (when possible) open a Trackio experiment.

        Args:
            experiment_name: Human-readable experiment identifier.
            trackio_url: Trackio server URL; falls back to $TRACKIO_URL.
            trackio_token: Auth token; falls back to $TRACKIO_TOKEN.
            enable_tracking: Master switch; forced off if trackio is missing.
            log_artifacts: Whether checkpoints/files are uploaded.
            log_metrics: Whether metric dicts are sent to Trackio.
            log_config: Whether the run configuration is sent to Trackio.
        """
        self.experiment_name = experiment_name
        self.enable_tracking = enable_tracking and TRACKIO_AVAILABLE
        self.log_artifacts = log_artifacts
        # BUGFIX: these flags used to be stored as self.log_metrics /
        # self.log_config, which shadowed the methods of the same name and
        # made monitor.log_metrics(...) raise "'bool' object is not callable".
        self.log_metrics_enabled = log_metrics
        self.log_config_enabled = log_config

        # Experiment metadata must exist BEFORE _setup_trackio runs: that
        # method reads self.start_time and assigns self.experiment_id.
        # (Previously this ran after setup, which crashed the setup path on a
        # missing attribute and then clobbered the new experiment id with None.)
        self.experiment_id = None
        self.start_time = datetime.now()
        self.metrics_history = []  # local copy of every metrics dict sent
        self.artifacts = []        # paths of files saved/uploaded so far

        # Initialize Trackio client
        self.trackio_client = None
        if self.enable_tracking:
            self._setup_trackio(trackio_url, trackio_token)

        logger.info(f"Initialized monitoring for experiment: {experiment_name}")

    def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
        """Create the Trackio client and register the experiment.

        On any failure, tracking is disabled rather than raising.
        """
        try:
            # Get Trackio configuration from environment or parameters
            url = trackio_url or os.getenv('TRACKIO_URL')
            token = trackio_token or os.getenv('TRACKIO_TOKEN')
            if not url:
                logger.warning("Trackio URL not provided. Set TRACKIO_URL environment variable.")
                self.enable_tracking = False
                return
            self.trackio_client = TrackioClient(
                url=url,
                token=token
            )
            # Create or get experiment
            self.experiment_id = self.trackio_client.create_experiment(
                name=self.experiment_name,
                description=f"SmolLM3 fine-tuning experiment started at {self.start_time}"
            )
            logger.info(f"Trackio client initialized. Experiment ID: {self.experiment_id}")
        except Exception as e:
            logger.error(f"Failed to initialize Trackio: {e}")
            self.enable_tracking = False

    def log_config(self, config: Dict[str, Any]):
        """Log the experiment configuration to Trackio and to a local JSON file."""
        if not self.enable_tracking or not self.log_config_enabled:
            return
        try:
            # Log configuration as parameters
            self.trackio_client.log_parameters(
                experiment_id=self.experiment_id,
                parameters=config
            )
            # Also save config locally
            config_path = f"config_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
            with open(config_path, 'w') as f:
                json.dump(config, f, indent=2, default=str)
            self.artifacts.append(config_path)
            logger.info(f"Configuration logged to Trackio and saved to {config_path}")
        except Exception as e:
            logger.error(f"Failed to log configuration: {e}")

    def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None):
        """Log a dict of training metrics, stamped with time (and step if given)."""
        if not self.enable_tracking or not self.log_metrics_enabled:
            return
        try:
            # Copy so the caller's dict is not mutated by the stamps below.
            metrics = dict(metrics)
            metrics['timestamp'] = datetime.now().isoformat()
            if step is not None:
                metrics['step'] = step
            # Log to Trackio
            self.trackio_client.log_metrics(
                experiment_id=self.experiment_id,
                metrics=metrics,
                step=step
            )
            # Store locally
            self.metrics_history.append(metrics)
            logger.debug(f"Metrics logged: {metrics}")
        except Exception as e:
            logger.error(f"Failed to log metrics: {e}")

    def log_model_checkpoint(self, checkpoint_path: str, step: Optional[int] = None):
        """Upload a model checkpoint directory/file as a Trackio artifact."""
        if not self.enable_tracking or not self.log_artifacts:
            return
        try:
            # Log checkpoint as artifact. "is not None" so that step 0 still
            # gets a numbered artifact name (plain "if step" dropped it).
            self.trackio_client.log_artifact(
                experiment_id=self.experiment_id,
                file_path=checkpoint_path,
                artifact_name=f"checkpoint_step_{step}" if step is not None else "checkpoint"
            )
            self.artifacts.append(checkpoint_path)
            logger.info(f"Checkpoint logged: {checkpoint_path}")
        except Exception as e:
            logger.error(f"Failed to log checkpoint: {e}")

    def log_evaluation_results(self, results: Dict[str, Any], step: Optional[int] = None):
        """Log evaluation results as eval_-prefixed metrics plus a local JSON file."""
        if not self.enable_tracking:
            return
        try:
            # Add evaluation prefix to metrics
            eval_metrics = {f"eval_{k}": v for k, v in results.items()}
            self.log_metrics(eval_metrics, step)
            # Save evaluation results locally
            eval_path = f"eval_results_step_{step}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
            with open(eval_path, 'w') as f:
                json.dump(results, f, indent=2, default=str)
            self.artifacts.append(eval_path)
            logger.info(f"Evaluation results logged and saved to {eval_path}")
        except Exception as e:
            logger.error(f"Failed to log evaluation results: {e}")

    def log_system_metrics(self, step: Optional[int] = None):
        """Log GPU memory/utilization and CPU/RAM usage as metrics."""
        if not self.enable_tracking:
            return
        try:
            system_metrics = {}
            # GPU metrics
            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    system_metrics[f'gpu_{i}_memory_allocated'] = torch.cuda.memory_allocated(i) / 1024**3  # GB
                    system_metrics[f'gpu_{i}_memory_reserved'] = torch.cuda.memory_reserved(i) / 1024**3  # GB
                    system_metrics[f'gpu_{i}_utilization'] = torch.cuda.utilization(i) if hasattr(torch.cuda, 'utilization') else 0
            # CPU and memory metrics (basic). psutil is imported lazily and
            # is optional: an ImportError is caught by the except below.
            import psutil
            system_metrics['cpu_percent'] = psutil.cpu_percent()
            system_metrics['memory_percent'] = psutil.virtual_memory().percent
            self.log_metrics(system_metrics, step)
        except Exception as e:
            logger.error(f"Failed to log system metrics: {e}")

    def log_training_summary(self, summary: Dict[str, Any]):
        """Log a final training summary (with run duration) and save it locally.

        Note: this mutates the caller's dict by adding the duration keys.
        """
        if not self.enable_tracking:
            return
        try:
            # Add experiment duration
            end_time = datetime.now()
            duration = (end_time - self.start_time).total_seconds()
            summary['experiment_duration_seconds'] = duration
            summary['experiment_duration_hours'] = duration / 3600
            # Log final summary
            self.trackio_client.log_parameters(
                experiment_id=self.experiment_id,
                parameters=summary
            )
            # Save summary locally
            summary_path = f"training_summary_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
            with open(summary_path, 'w') as f:
                json.dump(summary, f, indent=2, default=str)
            self.artifacts.append(summary_path)
            logger.info(f"Training summary logged and saved to {summary_path}")
        except Exception as e:
            logger.error(f"Failed to log training summary: {e}")

    def create_monitoring_callback(self):
        """Create a callback for integration with Hugging Face Trainer.

        Returns None when tracking is disabled.

        NOTE(review): HF Trainer normally expects TrainerCallback subclasses;
        confirm this duck-typed callback is actually invoked by the Trainer in
        use, or derive it from transformers.TrainerCallback.
        """
        if not self.enable_tracking:
            return None

        class TrackioCallback:
            def __init__(self, monitor):
                self.monitor = monitor

            def on_log(self, args, state, control, logs=None, **kwargs):
                """Called when logs are created"""
                if logs:
                    self.monitor.log_metrics(logs, state.global_step)
                self.monitor.log_system_metrics(state.global_step)

            def on_save(self, args, state, control, **kwargs):
                """Called when a checkpoint is saved"""
                checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
                if os.path.exists(checkpoint_path):
                    self.monitor.log_model_checkpoint(checkpoint_path, state.global_step)

            def on_evaluate(self, args, state, control, metrics=None, **kwargs):
                """Called when evaluation is performed"""
                if metrics:
                    self.monitor.log_evaluation_results(metrics, state.global_step)

        return TrackioCallback(self)

    def get_experiment_url(self) -> Optional[str]:
        """Return the URL to view the experiment in Trackio, or None.

        Assumes TrackioClient exposes a .url attribute — TODO confirm.
        """
        if self.trackio_client and self.experiment_id:
            return f"{self.trackio_client.url}/experiments/{self.experiment_id}"
        return None

    def close(self):
        """Mark the Trackio experiment as completed (best-effort)."""
        if self.enable_tracking and self.trackio_client:
            try:
                # Mark experiment as completed
                self.trackio_client.update_experiment_status(
                    experiment_id=self.experiment_id,
                    status="completed"
                )
                logger.info("Monitoring session closed")
            except Exception as e:
                logger.error(f"Failed to close monitoring session: {e}")
# Utility function to create monitor from config
def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
    """Build a SmolLM3Monitor from a configuration object.

    Reads the optional monitoring attributes off *config* (falling back to
    sensible defaults when absent) and, when no experiment name is supplied,
    derives a timestamped one.
    """
    if experiment_name is None:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        experiment_name = f"smollm3_finetune_{stamp}"

    # Monitoring options and their defaults when the config lacks them.
    defaults = {
        'trackio_url': None,
        'trackio_token': None,
        'enable_tracking': True,
        'log_artifacts': True,
        'log_metrics': True,
        'log_config': True,
    }
    options = {key: getattr(config, key, fallback) for key, fallback in defaults.items()}
    return SmolLM3Monitor(experiment_name=experiment_name, **options)