File size: 6,815 Bytes
d9f7e1b
 
21d66ae
d9f7e1b
 
 
21d66ae
d9f7e1b
21d66ae
d9f7e1b
21d66ae
 
 
d9f7e1b
21d66ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9f7e1b
21d66ae
 
 
 
 
 
d9f7e1b
 
21d66ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9f7e1b
 
 
 
 
21d66ae
d9f7e1b
21d66ae
 
 
 
 
 
 
 
d9f7e1b
21d66ae
 
 
 
 
 
 
 
 
d9f7e1b
 
 
21d66ae
d9f7e1b
 
21d66ae
 
 
d9f7e1b
 
21d66ae
d9f7e1b
21d66ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9f7e1b
21d66ae
 
d9f7e1b
21d66ae
d9f7e1b
 
21d66ae
 
 
 
d9f7e1b
21d66ae
 
 
 
 
 
 
d9f7e1b
21d66ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9f7e1b
21d66ae
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env python3
"""
Test script to verify the training pipeline fixes
"""

import os
import sys
import logging
from pathlib import Path

# Add project root to path so the `src` package is importable.
# resolve() makes the path absolute first: with a relative __file__ such as
# "test_x.py", Path(__file__).parent.parent collapses to "." instead of the
# real parent-of-parent directory.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))

def test_imports():
    """Check that every pipeline module can be imported.

    Each ``src.<module>`` listed below is imported in turn and the expected
    name is looked up on it. One status line is printed per module, and the
    check stops at the first failure.

    Returns:
        bool: True when every module imports and exposes the expected name,
        False as soon as one of them fails.
    """
    # Function-scope import, matching how the other checks in this script
    # import what they need.
    from importlib import import_module

    print("πŸ” Testing imports...")

    # (module file stem, name that must be importable from src.<stem>)
    required = (
        ("config", "get_config"),
        ("model", "SmolLM3Model"),
        ("data", "SmolLM3Dataset"),
        ("trainer", "SmolLM3Trainer"),
        ("monitoring", "create_monitor_from_config"),
    )

    for stem, symbol in required:
        try:
            module = import_module(f"src.{stem}")
            # Mirror `from src.<stem> import <symbol>`: a missing name is an
            # import failure, not just a missing module.
            if not hasattr(module, symbol):
                raise ImportError(
                    f"cannot import name {symbol!r} from {module.__name__!r}"
                )
            print(f"βœ… {stem}.py imported successfully")
        except Exception as e:
            # Any error raised while importing (including errors raised by the
            # module's own top-level code) counts as a failed import.
            print(f"❌ {stem}.py import failed: {e}")
            return False

    return True

def test_config_loading():
    """Load the H100 lightweight config and echo a few of its fields.

    Returns:
        bool: True if the config loads and every echoed field is readable,
        False if any step raises.
    """
    print("\nπŸ” Testing configuration loading...")

    try:
        from src.config import get_config

        # The H100 lightweight config is the one exercised by this check
        cfg = get_config("config/train_smollm3_h100_lightweight.py")
        print("βœ… Configuration loaded successfully")

        # (label shown to the user, attribute read from the config)
        echoed_fields = (
            ("Model", "model_name"),
            ("Dataset", "dataset_name"),
            ("Batch size", "batch_size"),
            ("Learning rate", "learning_rate"),
        )
        for label, attr in echoed_fields:
            print(f"   {label}: {getattr(cfg, attr)}")
    except Exception as exc:
        print(f"❌ Configuration loading failed: {exc}")
        return False
    else:
        return True

def test_monitoring_setup():
    """Build a monitor from the config with an unreachable Trackio URL.

    Returns:
        bool: True if the monitor is created and its echoed attributes are
        readable, False if any step raises.
    """
    print("\nπŸ” Testing monitoring setup...")

    try:
        from src.monitoring import create_monitor_from_config
        from src.config import get_config

        cfg = get_config("config/train_smollm3_h100_lightweight.py")

        # Point Trackio at a URL that does not exist so that monitor creation
        # has to cope without a reachable Space.
        overrides = {
            "trackio_url": "https://non-existent-space.hf.space",
            "experiment_name": "test_experiment",
        }
        for field, value in overrides.items():
            setattr(cfg, field, value)

        tracker = create_monitor_from_config(cfg)
        print("βœ… Monitoring setup successful")

        # (label shown to the user, attribute read from the monitor)
        echoed_fields = (
            ("Experiment", "experiment_name"),
            ("Tracking enabled", "enable_tracking"),
            ("HF Dataset", "dataset_repo"),
        )
        for label, attr in echoed_fields:
            print(f"   {label}: {getattr(tracker, attr)}")
    except Exception as exc:
        print(f"❌ Monitoring setup failed: {exc}")
        return False
    else:
        return True

def test_trainer_creation():
    """Construct the model, dataset and trainer objects from the config.

    The three objects are built in dependency order (model, then dataset
    using the model's tokenizer, then trainer), printing a status line after
    each one.

    Returns:
        bool: True if all three constructors succeed, False if any step
        raises.
    """
    print("\nπŸ” Testing trainer creation...")

    try:
        from src.config import get_config
        from src.model import SmolLM3Model
        from src.data import SmolLM3Dataset
        from src.trainer import SmolLM3Trainer

        cfg = get_config("config/train_smollm3_h100_lightweight.py")

        # Model wrapper
        model_kwargs = dict(
            model_name=cfg.model_name,
            max_seq_length=cfg.max_seq_length,
            config=cfg,
        )
        lm = SmolLM3Model(**model_kwargs)
        print("βœ… Model created successfully")

        # Dataset wrapper, sharing the model's tokenizer
        dataset_kwargs = dict(
            data_path=cfg.dataset_name,
            tokenizer=lm.tokenizer,
            max_seq_length=cfg.max_seq_length,
            config=cfg,
        )
        train_data = SmolLM3Dataset(**dataset_kwargs)
        print("βœ… Dataset created successfully")

        # Trainer; kept bound to a local so it lives until the function returns
        trainer = SmolLM3Trainer(
            model=lm,
            dataset=train_data,
            config=cfg,
            output_dir="/tmp/test_output",
            init_from="scratch",
        )
        print("βœ… Trainer created successfully")
    except Exception as exc:
        print(f"❌ Trainer creation failed: {exc}")
        return False
    else:
        return True

def format_log_message(step, logs):
    """Render one console log line from a trainer ``logs`` dict.

    Numeric ``loss`` / ``learning_rate`` values are formatted with ``.4f`` /
    ``.2e``; anything else (missing keys, strings, ``None``) is passed through
    ``str()`` so a non-numeric value can never raise a format error.

    Args:
        step: Value shown after ``Step`` (a step number or a placeholder).
        logs: Mapping that may contain ``'loss'`` and ``'learning_rate'``.

    Returns:
        str: A line of the form ``Step <step>: loss=<loss>, lr=<lr>``.
    """
    def _render(value, spec):
        # Only apply a numeric format spec to real numbers.
        if isinstance(value, (int, float)):
            return format(value, spec)
        return str(value)

    loss_str = _render(logs.get('loss', 'N/A'), '.4f')
    lr_str = _render(logs.get('learning_rate', 'N/A'), '.2e')
    return f"Step {step}: loss={loss_str}, lr={lr_str}"


def test_format_string_fix():
    """Exercise the log-line formatting through a ``TrainerCallback``.

    A local callback that uses ``format_log_message`` is instantiated and its
    ``on_log`` hook is driven with numeric, non-numeric, unrelated, empty and
    ``None`` payloads. The produced lines are compared with the expected
    output, so the check fails if formatting raises or yields the wrong text.
    Note that this checks a local replica of the formatting logic, not a
    callback defined in ``src.trainer``.

    Returns:
        bool: True if the imports succeed and every payload is formatted as
        expected, False otherwise.
    """
    print("\nπŸ” Testing format string fix...")

    try:
        from types import SimpleNamespace

        # Imported only to confirm the trainer module itself loads.
        from src.trainer import SmolLM3Trainer  # noqa: F401

        from transformers import TrainerCallback

        class TestCallback(TrainerCallback):
            def __init__(self):
                super().__init__()
                self.messages = []

            def on_log(self, args, state, control, logs=None, **kwargs):
                # Empty or non-dict payloads are ignored.
                if logs and isinstance(logs, dict):
                    step = getattr(state, 'global_step', 'unknown')
                    message = format_log_message(step, logs)
                    self.messages.append(message)
                    print(message)

        callback = TestCallback()
        state = SimpleNamespace(global_step=10)

        # Previously the callback class was defined but never called, so the
        # check could not fail; these payloads actually run the formatting.
        payloads = [
            {'loss': 1.23456, 'learning_rate': 5e-5},
            {'loss': 'N/A', 'learning_rate': None},
            {'eval_runtime': 1.0},
            {},
            None,
        ]
        for payload in payloads:
            callback.on_log(None, state, None, logs=payload)

        expected = [
            "Step 10: loss=1.2346, lr=5.00e-05",
            "Step 10: loss=N/A, lr=None",
            "Step 10: loss=N/A, lr=N/A",
        ]
        if callback.messages != expected:
            raise ValueError(f"unexpected log lines: {callback.messages}")

        print("βœ… Format string fix works correctly")
        return True
    except Exception as e:
        print(f"❌ Format string fix test failed: {e}")
        return False

def main():
    """Run every check in order and print a pass/fail summary.

    A check counts as passed when it returns a truthy value. A check that
    raises is reported as crashed and counted as failed; the remaining checks
    still run.

    Returns:
        bool: True if all checks passed, False otherwise.
    """
    print("πŸš€ Testing SmolLM3 Training Pipeline Fixes")
    print("=" * 50)

    checks = (
        test_imports,
        test_config_loading,
        test_monitoring_setup,
        test_trainer_creation,
        test_format_string_fix,
    )

    def _succeeded(check):
        # Shield the run from a crashing check so later checks still execute.
        try:
            return bool(check())
        except Exception as exc:
            print(f"❌ Test {check.__name__} crashed: {exc}")
            return False

    outcomes = [_succeeded(check) for check in checks]
    passed = sum(outcomes)
    total = len(checks)

    print(f"\nπŸ“Š Test Results: {passed}/{total} tests passed")

    if passed != total:
        print("❌ Some tests failed. Please check the errors above.")
        return False

    print("βœ… All tests passed! The training pipeline should work correctly.")
    return True

if __name__ == "__main__":
    # Map the boolean outcome of main() onto a process exit status:
    # 0 when everything passed, 1 otherwise.
    success = main()
    exit_status = 0 if success else 1
    sys.exit(exit_status)