File size: 5,738 Bytes
39db0ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
"""
Trackio Module Interface for TRL Library
Provides the interface expected by TRL library while integrating with our custom monitoring system
"""

import os
import logging
from typing import Dict, Any, Optional
from datetime import datetime

# Import our custom monitoring
from monitoring import SmolLM3Monitor

# Module-level logger named after this module; every function below reports
# warnings and failures through it instead of raising.
logger = logging.getLogger(__name__)

# Global monitor instance shared by all functions in this module.
# It starts as None and is only assigned inside init(), once a
# SmolLM3Monitor has been constructed successfully.
_monitor = None

def init(
    project_name: Optional[str] = None,
    experiment_name: Optional[str] = None,
    **kwargs
) -> str:
    """
    Initialize trackio experiment (TRL interface)

    Builds a SmolLM3Monitor, stores it in the module-level ``_monitor`` and
    stamps it with a freshly generated experiment ID. Failures while building
    the monitor are logged (with traceback) and never propagate to the caller.

    Args:
        project_name: Name of the project. May also be passed as the
            ``project`` keyword, the spelling used by the upstream trackio
            API, so callers written against that API keep working.
        experiment_name: Name of the experiment (optional). May also be
            passed as the ``name`` keyword. When absent, the project name is
            used as the experiment name.
        **kwargs: Additional configuration parameters. Recognised keys are
            ``trackio_url``, ``trackio_token``, ``hf_token`` and
            ``dataset_repo`` (each falls back to the ``TRACKIO_URL``,
            ``TRACKIO_TOKEN``, ``HF_TOKEN`` and ``TRACKIO_DATASET_REPO``
            environment variables respectively), plus the ``project`` and
            ``name`` aliases described above. Any other key is ignored.

    Returns:
        Experiment ID of the form ``trl_<YYYYmmdd_HHMMSS>`` on success, or
        ``trl_fallback_<YYYYmmdd_HHMMSS>`` when initialization failed.

    Raises:
        TypeError: If no project name is supplied under either spelling.
    """
    global _monitor

    # Accept the upstream trackio keyword spellings as aliases; the explicit
    # parameters win when both forms are given.
    if project_name is None:
        project_name = kwargs.get('project')
    if experiment_name is None:
        experiment_name = kwargs.get('name')

    # A missing project was a TypeError when the parameter was required;
    # keep that contract (raised outside the try so it is not swallowed).
    if project_name is None:
        raise TypeError("init() missing required argument: 'project_name'")

    try:
        # Extract configuration from kwargs, falling back to the environment
        trackio_url = kwargs.get('trackio_url') or os.environ.get('TRACKIO_URL')
        trackio_token = kwargs.get('trackio_token') or os.environ.get('TRACKIO_TOKEN')
        hf_token = kwargs.get('hf_token') or os.environ.get('HF_TOKEN')
        dataset_repo = kwargs.get('dataset_repo') or os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')

        # Use experiment_name if provided, otherwise use project_name
        exp_name = experiment_name or project_name

        # Create monitor instance; _monitor is only rebound if this succeeds
        _monitor = SmolLM3Monitor(
            experiment_name=exp_name,
            trackio_url=trackio_url,
            trackio_token=trackio_token,
            enable_tracking=True,
            log_artifacts=True,
            log_metrics=True,
            log_config=True,
            hf_token=hf_token,
            dataset_repo=dataset_repo
        )

        # Generate experiment ID (second resolution) and attach it to the monitor
        experiment_id = f"trl_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        _monitor.experiment_id = experiment_id

        logger.info(f"Trackio initialized for experiment: {exp_name}")
        logger.info(f"Experiment ID: {experiment_id}")

        return experiment_id

    except Exception as e:
        # Best-effort: tracking must never abort training. Keep the traceback
        # so a one-time initialization failure can actually be diagnosed.
        logger.error(f"Failed to initialize trackio: {e}", exc_info=True)
        # Return a fallback experiment ID
        return f"trl_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

def log(
    metrics: Dict[str, Any],
    step: Optional[int] = None,
    **kwargs
):
    """
    Log metrics to trackio (TRL interface)

    Forwards the metrics to the monitor's log_metrics and then asks the
    monitor to record system metrics for the same step. When no monitor has
    been set up a warning is logged and nothing else happens. Any exception
    is caught and logged as an error rather than raised.

    Args:
        metrics: Dictionary of metrics to log
        step: Current training step
        **kwargs: Additional parameters (accepted but not used here)
    """
    monitor = _monitor

    try:
        if monitor is not None:
            # Training metrics first, then system metrics for the same step
            monitor.log_metrics(metrics, step)
            monitor.log_system_metrics(step)
        else:
            logger.warning("Trackio not initialized, skipping log")
    except Exception as err:
        logger.error(f"Failed to log metrics: {err}")

def finish():
    """
    Finish trackio experiment (TRL interface)

    Calls close() on the current monitor and logs a confirmation. The
    module-level monitor reference itself is left in place. When no monitor
    has been set up only a warning is logged. Any exception is caught and
    logged as an error rather than raised.
    """
    monitor = _monitor

    try:
        if monitor is not None:
            # Close the monitoring session
            monitor.close()
            logger.info("Trackio experiment finished")
        else:
            logger.warning("Trackio not initialized, skipping finish")
    except Exception as err:
        logger.error(f"Failed to finish trackio experiment: {err}")

def log_config(config: Dict[str, Any]):
    """
    Log configuration to trackio (TRL interface)

    Passes the configuration to the monitor's log_configuration. When no
    monitor has been set up only a warning is logged. Any exception is
    caught and logged as an error rather than raised.

    Args:
        config: Configuration dictionary to log
    """
    monitor = _monitor

    try:
        if monitor is not None:
            # Delegate to our custom monitor
            monitor.log_configuration(config)
        else:
            logger.warning("Trackio not initialized, skipping config log")
    except Exception as err:
        logger.error(f"Failed to log config: {err}")

def log_checkpoint(checkpoint_path: str, step: Optional[int] = None):
    """
    Log checkpoint to trackio (TRL interface)

    Passes the checkpoint path and step to the monitor's
    log_model_checkpoint. When no monitor has been set up only a warning is
    logged. Any exception is caught and logged as an error rather than raised.

    Args:
        checkpoint_path: Path to the checkpoint file
        step: Current training step
    """
    monitor = _monitor

    try:
        if monitor is not None:
            # Delegate to our custom monitor
            monitor.log_model_checkpoint(checkpoint_path, step)
        else:
            logger.warning("Trackio not initialized, skipping checkpoint log")
    except Exception as err:
        logger.error(f"Failed to log checkpoint: {err}")

def log_evaluation_results(results: Dict[str, Any], step: Optional[int] = None):
    """
    Log evaluation results to trackio (TRL interface)

    Passes the results and step to the monitor's log_evaluation_results.
    When no monitor has been set up only a warning is logged. Any exception
    is caught and logged as an error rather than raised.

    Args:
        results: Evaluation results dictionary
        step: Current training step
    """
    monitor = _monitor

    try:
        if monitor is not None:
            # Delegate to our custom monitor
            monitor.log_evaluation_results(results, step)
        else:
            logger.warning("Trackio not initialized, skipping evaluation log")
    except Exception as err:
        logger.error(f"Failed to log evaluation results: {err}")

# Additional utility functions for TRL compatibility
def get_experiment_url() -> Optional[str]:
    """Return the monitor's experiment URL, or None when no monitor is set"""
    return _monitor.get_experiment_url() if _monitor is not None else None

def is_available() -> bool:
    """Report whether a monitor is set and has tracking enabled"""
    if _monitor is None:
        return False
    # Reflects the monitor's own enable_tracking flag
    return _monitor.enable_tracking

def get_monitor():
    """Return the module-level monitor instance, which may be None (for advanced usage)"""
    current = _monitor
    return current