SmolFactory / tests /quick_test_training.py
Tonic's picture
improves model card
3eb616f verified
raw
history blame
1.8 kB
#!/usr/bin/env python3
"""
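Quick smoke test for the H100 lightweight training fix: loads the config,
builds the model wrapper and its training arguments, and prints the
precision settings reported by each.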
"""
import os
import sys
# Make the project packages importable when this file is run directly.
# This script sits in the ``tests/`` directory, so the directory containing
# it is NOT the project root; the ``config`` and ``src`` packages imported by
# ``main()`` are expected one level up.
# NOTE(review): confirm ``config/`` and ``src/`` live in the parent directory.
_script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(_script_dir)
# Keep the script directory at the front (as before) and add the real
# project root right after it, so either layout resolves the imports.
sys.path[:0] = [_script_dir, project_root]
def main() -> int:
    """Run a quick smoke test of the H100 lightweight training setup.

    Sets three CUDA/PyTorch environment variables, then loads the lightweight
    config, builds the ``SmolLM3Model`` wrapper and asks it for training
    arguments, printing the precision settings reported at each step.

    Returns:
        ``0`` if every step succeeded, ``1`` if any step raised (the error
        message and its traceback are printed).
    """
    print("🔧 Testing H100 Lightweight Training Fix")
    print("=" * 50)

    # Set environment variables to fix mixed precision issues. They are
    # assigned before the project modules are imported below, so they are
    # already in place when those imports run.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    os.environ["TORCH_USE_CUDA_DSA"] = "1"
    print("✅ Environment variables set")

    # The project imports live inside the try block on purpose: a missing or
    # broken module is reported as a failed check instead of crashing.
    try:
        # Test configuration
        from config.train_smollm3_h100_lightweight import SmolLM3ConfigH100Lightweight

        config = SmolLM3ConfigH100Lightweight()
        print(f"✅ Configuration loaded: fp16={config.fp16}, bf16={config.bf16}")

        # Test model wrapper construction.
        # NOTE(review): the original comment states this does not load the
        # full model; confirm against SmolLM3Model.__init__.
        from src.model import SmolLM3Model

        model = SmolLM3Model(
            model_name="HuggingFaceTB/SmolLM3-3B",
            max_seq_length=4096,
            config=config,
        )
        print(f"✅ Model dtype: {model.torch_dtype}")
        print(f"✅ Model device map: {model.device_map}")

        # Test training arguments
        training_args = model.get_training_arguments("/tmp/test")
        print(f"✅ Training args: fp16={training_args.fp16}, bf16={training_args.bf16}")

        print("\n🎉 All tests passed!")
        print("You can now run the training with:")
        print(" ./launch.sh")
    except Exception as e:
        # Top-level boundary of the script: report the failure with its full
        # traceback and signal it through the return code.
        print(f"❌ Error: {e}")
        import traceback

        traceback.print_exc()
        return 1
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status. sys.exit is
    # used instead of the bare ``exit`` helper, which is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (for example under ``python -S``).
    sys.exit(main())