# Scraped file-viewer header (not part of the script): status "Running"; file size 4,902 bytes; revision 987a674.
#!/usr/bin/env python3
"""
Test monitoring integration for real experiment.

Script-style smoke test. It checks that the monitoring module and the
Trackio API client can be imported, that the balanced A100 training
configuration loads, and that a monitor and its callback can be created.
When every check passes it prints the command line for the real
experiment run.
"""
import os
import sys
from pathlib import Path
# Put this file's own directory at the front of the import path so that
# modules located next to it can be imported by the checks below.
sys.path.insert(0, str(Path(__file__).parent))
def test_monitoring_setup():
    """Check that the monitoring stack is importable and correctly wired.

    Runs five checks in order and stops at the first failure:

    1. ``monitoring`` can be imported.
    2. ``trackio_api_client`` can be imported.
    3. The balanced A100 configuration loads and exposes the expected
       attributes.
    4. A monitor can be created from that configuration.
    5. The monitor can create its monitoring callback.

    Progress is reported on stdout.

    Returns:
        bool: ``True`` when every check passed, ``False`` as soon as one
        check fails (the reason is printed before returning).
    """
    divider = "=" * 50

    # NOTE(review): the status glyphs in this function were mis-encoded in
    # the source and have been reconstructed. The banner glyph on the next
    # line could not be recovered unambiguously - confirm against upstream.
    print("🔍 Testing Monitoring Integration")
    print(divider)

    # Test 1: Check if monitoring module can be imported
    try:
        from monitoring import SmolLM3Monitor, create_monitor_from_config
    except ImportError as e:
        print(f"❌ Failed to import monitoring module: {e}")
        return False
    else:
        print("✅ Monitoring module imported successfully")

    # Test 2: Check if API client can be imported
    try:
        from trackio_api_client import TrackioAPIClient
    except ImportError as e:
        print(f"❌ Failed to import Trackio API client: {e}")
        return False
    else:
        print("✅ Trackio API client imported successfully")

    # Test 3: Test configuration loading
    # The attribute reads stay inside the try block on purpose: a config
    # object missing any of these fields counts as a failed load.
    try:
        from config.train_smollm3_openhermes_fr_a100_balanced import get_config
        config = get_config("config/train_smollm3_openhermes_fr_a100_balanced.py")
        print("✅ Configuration loaded successfully")
        print(f" Model: {config.model_name}")
        print(f" Batch size: {config.batch_size}")
        print(f" Max iterations: {config.max_iters}")
        print(f" Enable tracking: {config.enable_tracking}")
        print(f" Trackio URL: {config.trackio_url}")
    except Exception as e:
        print(f"❌ Failed to load configuration: {e}")
        return False

    # Test 4: Test monitor creation
    try:
        # Point the loaded config at the test Trackio endpoint and use a
        # dedicated experiment name for this check.
        config.trackio_url = "https://tonic-test-trackio-test.hf.space"
        config.experiment_name = "test_monitoring_integration"
        monitor = create_monitor_from_config(config)
        print("✅ Monitor created successfully")
        print(f" Experiment name: {monitor.experiment_name}")
        print(f" Enable tracking: {monitor.enable_tracking}")
        print(f" Log metrics: {monitor.log_metrics}")
        print(f" Log artifacts: {monitor.log_artifacts}")
        if monitor.enable_tracking and monitor.trackio_client:
            print("✅ Trackio client initialized")
            if monitor.experiment_id:
                print(f" Experiment ID: {monitor.experiment_id}")
            else:
                print(" ⚠️ No experiment ID (will be created during training)")
        else:
            # Reported as a warning only; the check still passes.
            print(" ⚠️ Trackio client not initialized")
    except Exception as e:
        print(f"❌ Failed to create monitor: {e}")
        return False

    # Test 5: Test callback creation
    try:
        callback = monitor.create_monitoring_callback()
        if callback:
            print("✅ Monitoring callback created successfully")
        else:
            # A falsy callback is reported as a warning, not a failure.
            print(" ⚠️ No monitoring callback (tracking disabled)")
    except Exception as e:
        print(f"❌ Failed to create callback: {e}")
        return False

    print("\n" + divider)
    print("🎯 Monitoring Integration Test Complete")
    print(divider)
    return True
def test_real_experiment_command():
    """Print the command line for the real experiment run.

    Nothing is executed here: the function only builds the argument list
    and prints it, followed by a summary of what the run is expected to do
    and the expected training parameters.

    Returns:
        None
    """
    divider = "=" * 50

    # NOTE(review): the glyphs in this function were mis-encoded in the
    # source and have been reconstructed. The banner glyph on the next line
    # could not be recovered unambiguously - confirm against upstream.
    print("\n🚀 Testing Real Experiment Command")
    print(divider)

    # Build the command
    cmd = [
        "python", "run_a100_large_experiment.py",
        "--config", "config/train_smollm3_openhermes_fr_a100_balanced.py",
        "--experiment-name", "petit-elle-l-aime-3-balanced-real",
        "--output-dir", "./outputs/balanced-real",
        "--trackio-url", "https://tonic-test-trackio-test.hf.space",
    ]
    print("Command to run:")
    print(" ".join(cmd))

    print("\nThis command will:")
    print("✅ Load the balanced A100 configuration")
    print("✅ Create a real experiment in Trackio")
    print("✅ Log real training metrics every 25 steps")
    print("✅ Save checkpoints every 2000 steps")
    print("✅ Monitor progress in real-time")

    # These values are hard-coded text and are not read from the config
    # file named in the command above.
    print("\nExpected training parameters:")
    print(" Model: HuggingFaceTB/SmolLM3-3B")
    print(" Batch size: 8")
    print(" Gradient accumulation: 16")
    print(" Effective batch size: 128")
    print(" Learning rate: 3.5e-6")
    print(" Max iterations: 18000")
    print(" Mixed precision: bf16")
    print(" Max sequence length: 12288")

    print("\n" + divider)
    print("🎯 Ready to run real experiment!")
    print(divider)
if __name__ == "__main__":
    # Run the integration checks first; the real experiment command is only
    # shown when every check passed.
    if test_monitoring_setup():
        test_real_experiment_command()
    else:
        print("\n❌ Monitoring integration test failed. Please fix issues before running real experiment.")
        # Exit with a non-zero status so shells and CI can detect the failure.
        sys.exit(1)