	adds A100 large experiments
- A100_LARGE_SCALE_GUIDE.md +195 -0
- CLOUD_DEPLOYMENT_GUIDE.md +462 -0
- CLOUD_TRAINING_GUIDE.md +440 -0
- DEPLOYMENT_GUIDE.md +397 -0
- PUSH_GUIDE.md +406 -0
- README.md +14 -1
- TRACKIO_INTEGRATION.md +252 -0
- app.py +318 -0
- cloud_deployment.sh +279 -0
- config/__init__.py +19 -0
- config/runpod_config.py +47 -0
- config/train_smollm3.py +9 -0
- config/train_smollm3_dpo.py +85 -28
- config/train_smollm3_openhermes_fr.py +129 -0
- config/train_smollm3_openhermes_fr_a100_large.py +161 -0
- config/train_smollm3_openhermes_fr_a100_multiple_passes.py +164 -0
- data.py +21 -1
- deploy_trackio_space.py +235 -0
- monitoring.py +298 -0
- push_to_huggingface.py +486 -0
- requirements.txt +8 -1
- requirements_space.txt +18 -0
- run_a100_large_experiment.py +134 -0
- test_monitoring.py +181 -0
- train.py +34 -4
- trainer.py +41 -0
    	
A100_LARGE_SCALE_GUIDE.md ADDED
@@ -0,0 +1,195 @@
# A100 Large Scale Training Guide

This guide provides configurations and instructions for running fully-fledged experiments with multiple passes over the full OpenHermes-FR dataset (800k+ datapoints) using A100 GPUs.

## Available Configurations

### 1. A100 Large Batch Configuration
**File**: `config/train_smollm3_openhermes_fr_a100_large.py`

**Key Features**:
- **Effective Batch Size**: 128 (8 × 16 gradient accumulation)
- **Training Duration**: ~1.3 passes (8,000 steps)
- **Learning Rate**: 5e-6 (optimized for large batches)
- **Mixed Precision**: bf16 (A100 optimized)
- **Sequence Length**: 8192 tokens
- **Memory Optimizations**: No gradient checkpointing for A100 efficiency

**Estimated Training Time**: ~6-8 hours on A100

### 2. Multiple Passes Configuration
**File**: `config/train_smollm3_openhermes_fr_a100_multiple_passes.py`

**Key Features**:
- **Effective Batch Size**: 120 (6 × 20 gradient accumulation)
- **Training Duration**: ~4 passes (25,000 steps)
- **Learning Rate**: 3e-6 (conservative for long training)
- **Warmup Steps**: 2000 (longer warmup for stability)
- **Checkpoint Strategy**: More frequent saves (every 2000 steps)

**Estimated Training Time**: ~20-24 hours on A100

## Training Commands

### Quick Start - Large Batch Experiment
```bash
python run_a100_large_experiment.py \
    --config config/train_smollm3_openhermes_fr_a100_large.py \
    --experiment-name "smollm3_openhermes_fr_large_batch" \
    --output-dir ./outputs/large_batch
```

### Multiple Passes Experiment
```bash
python run_a100_large_experiment.py \
    --config config/train_smollm3_openhermes_fr_a100_multiple_passes.py \
    --experiment-name "smollm3_openhermes_fr_multiple_passes" \
    --output-dir ./outputs/multiple_passes
```

### Dry Run (Check Configuration)
```bash
python run_a100_large_experiment.py \
    --config config/train_smollm3_openhermes_fr_a100_large.py \
    --dry-run
```

### Resume Training
```bash
python run_a100_large_experiment.py \
    --config config/train_smollm3_openhermes_fr_a100_multiple_passes.py \
    --resume ./outputs/multiple_passes/checkpoint-10000 \
    --output-dir ./outputs/multiple_passes
```

## Configuration Details

### Memory Usage Optimization
- **Gradient Checkpointing**: Disabled for A100 efficiency
- **Flash Attention**: Enabled for memory efficiency
- **bf16 Mixed Precision**: Better for A100 than fp16
- **Gradient Clipping**: 1.0 for stability
- **Group by Length**: Enabled for better batching

### Data Loading Optimization
- **Num Workers**: 8 for faster data loading
- **Pin Memory**: Enabled for GPU transfer efficiency
- **Prefetch Factor**: 2 for pipeline optimization

### Training Stability
- **Conservative Learning Rate**: Lower LR for large effective batch sizes
- **Longer Warmup**: More warmup steps for stability
- **Higher Beta2**: 0.999 for AdamW stability
- **Gradient Clipping**: Prevents gradient explosion

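These settings live in the two config files listed above. As a rough illustration only (the field names below follow common Hugging Face `TrainingArguments` naming and are assumptions, not a copy of the actual files), the optimizations map to values like these:

```python
# Illustrative sketch of the optimization settings described above.
# Field names follow common Hugging Face TrainingArguments conventions and are
# assumptions; see config/train_smollm3_openhermes_fr_a100_large.py for the
# authoritative values.
from dataclasses import dataclass

@dataclass
class A100OptimizationSettings:
    # Memory usage
    gradient_checkpointing: bool = False   # disabled: A100 80GB has headroom
    use_flash_attention: bool = True       # memory-efficient attention
    bf16: bool = True                      # preferred over fp16 on A100
    fp16: bool = False
    max_grad_norm: float = 1.0             # gradient clipping
    group_by_length: bool = True           # batch similar-length sequences

    # Data loading
    dataloader_num_workers: int = 8
    dataloader_pin_memory: bool = True
    dataloader_prefetch_factor: int = 2

    # Stability for large effective batches
    learning_rate: float = 5e-6            # large-batch config value
    adam_beta2: float = 0.999              # higher beta2 for AdamW stability

print(A100OptimizationSettings())
```
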
## Expected Results

### Large Batch Configuration (1.3 passes)
- **Training Steps**: 8,000
- **Effective Batch Size**: 128
- **Steps per Epoch**: ~6,250
- **Epochs**: ~1.3
- **Expected Loss**: Should converge to ~1.5-2.0

### Multiple Passes Configuration (4 passes)
- **Training Steps**: 25,000
- **Effective Batch Size**: 120
- **Steps per Epoch**: ~6,667
- **Epochs**: ~3.75
- **Expected Loss**: Should converge to ~1.2-1.5

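The step and epoch figures above follow from the dataset size and the effective batch size; a quick back-of-the-envelope check (assuming roughly 800,000 training examples) reproduces them:

```python
# Reproduce the steps-per-epoch and pass counts quoted above,
# assuming ~800k training examples.
dataset_size = 800_000

# Large batch configuration: 8 x 16 = 128
steps_per_epoch = dataset_size // 128
print(steps_per_epoch, 8_000 / steps_per_epoch)    # 6250, ~1.28 passes

# Multiple passes configuration: 6 x 20 = 120
steps_per_epoch = dataset_size // 120
print(steps_per_epoch, 25_000 / steps_per_epoch)   # 6666, ~3.75 passes
```
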
## Monitoring and Logging

### Trackio Integration
Both configurations include Trackio monitoring:
- **Metrics Logging**: Every 25-50 steps
- **Artifact Logging**: Model checkpoints
- **Config Logging**: Training configuration

### Checkpoint Strategy
- **Large Batch**: Save every 1000 steps (8 checkpoints)
- **Multiple Passes**: Save every 2000 steps (12 checkpoints)
- **Best Model**: Automatically load best model at end

## Hardware Requirements

### Minimum Requirements
- **GPU**: A100 80GB (or multiple A100s)
- **RAM**: 64GB+ system RAM
- **Storage**: 100GB+ for checkpoints and logs
- **Network**: Fast internet for dataset download

### Recommended Setup
- **GPU**: 2-4x A100 80GB
- **RAM**: 128GB+ system RAM
- **Storage**: 500GB+ NVMe SSD
- **Network**: 10Gbps+ connection

## Troubleshooting

### Out of Memory (OOM)
If you encounter OOM errors:
1. Reduce `batch_size` from 8 to 6 or 4
2. Increase `gradient_accumulation_steps` to maintain the effective batch size
3. Reduce `max_seq_length` from 8192 to 4096

### Slow Training
If training is too slow:
1. Increase `dataloader_num_workers` to 12-16
2. Ensure you're using bf16 mixed precision
3. Check that gradient checkpointing is disabled
4. Verify flash attention is enabled

### Convergence Issues
If the loss doesn't converge:
1. Reduce the learning rate by 2x
2. Increase warmup steps
3. Check gradient norms in the logs
4. Verify dataset quality

## Customization

### For Different Dataset Sizes
Adjust `max_iters` based on your dataset size:
```python
# For 1M datapoints with effective batch size 120
steps_per_epoch = 1000000 // 120  # ~8,333 steps
max_iters = steps_per_epoch * desired_epochs
```

### For Different GPU Memory
Adjust batch size and gradient accumulation:
```python
# For a 40GB A100
batch_size = 4
gradient_accumulation_steps = 32  # Effective batch size = 128

# For a 24GB GPU
batch_size = 2
gradient_accumulation_steps = 64  # Effective batch size = 128
```

## Performance Tips

1. **Use bf16**: Better than fp16 on A100
2. **Disable Gradient Checkpointing**: A100 has enough memory
3. **Use Flash Attention**: Memory-efficient attention
4. **Group by Length**: Better batching efficiency
5. **Pin Memory**: Faster GPU transfers
6. **Multiple Workers**: Faster data loading

## Expected Timeline

- **Large Batch**: 6-8 hours for 1.3 passes
- **Multiple Passes**: 20-24 hours for 4 passes
- **Full Dataset (5+ passes)**: 30+ hours

## Next Steps

After training completes:
1. Evaluate on the validation set
2. Test generation quality
3. Push to the Hugging Face Hub
4. Deploy for inference

For deployment instructions, see `DEPLOYMENT_GUIDE.md`.
    	
CLOUD_DEPLOYMENT_GUIDE.md ADDED
@@ -0,0 +1,462 @@
# Cloud Deployment Guide for SmolLM3 DPO Training

This guide provides the exact sequence of commands to deploy and run SmolLM3 DPO training on a cloud computing instance for 6 epochs.

## Prerequisites

### Cloud Instance Requirements

- **GPU**: NVIDIA A100, H100, or similar (16GB+ VRAM)
- **RAM**: 64GB+ system memory
- **Storage**: 100GB+ SSD storage
- **OS**: Ubuntu 20.04 or 22.04

### Required Information

Before starting, gather these details:
- Your Hugging Face username
- Your Hugging Face token (with write permissions)
- Your Trackio Space URL (if using monitoring)

## Step-by-Step Deployment

### Step 1: Launch Cloud Instance

Choose your cloud provider and launch an instance:

#### AWS (g5.2xlarge or g5.4xlarge)
```bash
# Launch instance with Ubuntu 22.04 and appropriate GPU
aws ec2 run-instances \
    --image-id ami-0c7217cdde317cfec \
    --instance-type g5.2xlarge \
    --key-name your-key-pair \
    --security-group-ids sg-xxxxxxxxx
```

#### Google Cloud (n1-standard-8 with T4/V100)
```bash
gcloud compute instances create smollm3-dpo \
    --zone=us-central1-a \
    --machine-type=n1-standard-8 \
    --accelerator="type=nvidia-tesla-t4,count=1" \
    --image-family=ubuntu-2204-lts \
    --image-project=ubuntu-os-cloud
```

#### Azure (Standard_NC6s_v3)
```bash
az vm create \
    --resource-group your-rg \
    --name smollm3-dpo \
    --image Canonical:0001-com-ubuntu-server-jammy:22_04-lts:latest \
    --size Standard_NC6s_v3 \
    --admin-username azureuser
```

### Step 2: Connect to Instance

```bash
# SSH to your instance
ssh -i your-key.pem ubuntu@your-instance-ip

# Or for Azure
ssh azureuser@your-instance-ip
```

### Step 3: Update System and Install Dependencies

```bash
# Update system
sudo apt-get update
sudo apt-get upgrade -y

# Install system dependencies
sudo apt-get install -y git curl wget unzip python3 python3-pip python3-venv

# Install the NVIDIA container toolkit (if not pre-installed)
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
```

### Step 4: Clone Repository and Setup Environment

```bash
# Clone your repository
git clone https://github.com/your-username/flexai-finetune.git
cd flexai-finetune

# Create virtual environment
python3 -m venv smollm3_env
source smollm3_env/bin/activate

# Install PyTorch with CUDA
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install project dependencies
pip install -r requirements.txt

# Install additional DPO dependencies (quoted so the shell doesn't treat >= as a redirect)
pip install "trl>=0.7.0"
pip install "peft>=0.4.0"
pip install "accelerate>=0.20.0"
```

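Before continuing, it is worth confirming that the CUDA build of PyTorch actually sees the GPU. A minimal check, run inside the activated virtual environment:

```python
# Minimal sanity check that PyTorch was installed with working CUDA support.
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
```
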
### Step 5: Configure Authentication

```bash
# Set your Hugging Face token
export HF_TOKEN="your_huggingface_token_here"

# Login to Hugging Face
huggingface-cli login --token $HF_TOKEN
```

### Step 6: Create Configuration Files

Create the DPO configuration file:

```bash
cat > config/train_smollm3_dpo_6epochs.py << 'EOF'
"""
SmolLM3 DPO Training Configuration - 6 Epochs
Optimized for cloud deployment
"""

from config.train_smollm3_dpo import SmolLM3DPOConfig

config = SmolLM3DPOConfig(
    # Model configuration
    model_name="HuggingFaceTB/SmolLM3-3B",
    max_seq_length=4096,
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Training configuration
    batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-6,
    weight_decay=0.01,
    warmup_steps=100,
    max_iters=None,  # Will be calculated based on epochs
    eval_interval=100,
    log_interval=10,
    save_interval=500,

    # DPO configuration
    beta=0.1,
    max_prompt_length=2048,

    # Optimizer configuration
    optimizer="adamw",
    beta1=0.9,
    beta2=0.95,
    eps=1e-8,

    # Scheduler configuration
    scheduler="cosine",
    min_lr=1e-6,

    # Mixed precision
    fp16=True,
    bf16=False,

    # Logging and saving
    save_steps=500,
    eval_steps=100,
    logging_steps=10,
    save_total_limit=3,

    # Evaluation
    eval_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Data configuration
    data_dir="smoltalk_dataset",
    train_file="train.json",
    validation_file="validation.json",

    # Chat template configuration
    use_chat_template=True,
    chat_template_kwargs={
        "enable_thinking": False,
        "add_generation_prompt": True
    },

    # Trackio monitoring configuration
    enable_tracking=True,
    trackio_url="https://your-trackio-space.hf.space",  # Change this
    trackio_token=None,
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
    experiment_name="smollm3_dpo_6epochs"
)
EOF
```

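Before launching a multi-hour job, it can help to confirm that the generated file parses and exposes the expected `config` object. A small check (run from the repository root so the `config` package is importable) might look like this:

```python
# Sanity-check that the generated DPO config file loads and defines `config`.
# Run from the repository root; the file imports config.train_smollm3_dpo.
import importlib.util

spec = importlib.util.spec_from_file_location(
    "train_smollm3_dpo_6epochs", "config/train_smollm3_dpo_6epochs.py"
)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

print(type(module.config).__name__)   # SmolLM3DPOConfig
print(module.config.experiment_name)  # smollm3_dpo_6epochs
```
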
### Step 7: Download and Prepare Dataset

```bash
# Create dataset preparation script
cat > prepare_dataset.py << 'EOF'
from datasets import load_dataset
import json
import os

# Load SmolTalk dataset
print('Loading SmolTalk dataset...')
dataset = load_dataset('HuggingFaceTB/smoltalk')

# Create dataset directory
os.makedirs('smoltalk_dataset', exist_ok=True)

# Convert to DPO format (preference pairs)
def convert_to_dpo_format(example):
    # For SmolTalk, we'll create preference pairs based on response quality
    # This is a simplified example - you may need to adjust based on your needs
    return {
        'prompt': example.get('prompt', ''),
        'chosen': example.get('chosen', ''),
        'rejected': example.get('rejected', '')
    }

# Process train split
train_data = []
for example in dataset['train']:
    dpo_example = convert_to_dpo_format(example)
    if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
        train_data.append(dpo_example)

# Process validation split
val_data = []
for example in dataset['validation']:
    dpo_example = convert_to_dpo_format(example)
    if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
        val_data.append(dpo_example)

# Save to files
with open('smoltalk_dataset/train.json', 'w') as f:
    json.dump(train_data, f, indent=2)

with open('smoltalk_dataset/validation.json', 'w') as f:
    json.dump(val_data, f, indent=2)

print(f'Dataset prepared: {len(train_data)} train samples, {len(val_data)} validation samples')
EOF

# Run dataset preparation
python prepare_dataset.py
```

### Step 8: Calculate Training Parameters

```bash
# Calculate training steps based on epochs
TOTAL_SAMPLES=$(python -c "import json; data=json.load(open('smoltalk_dataset/train.json')); print(len(data))")
BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=8
MAX_EPOCHS=6
EFFECTIVE_BATCH_SIZE=$((BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
STEPS_PER_EPOCH=$((TOTAL_SAMPLES / EFFECTIVE_BATCH_SIZE))
MAX_STEPS=$((STEPS_PER_EPOCH * MAX_EPOCHS))

echo "Training Configuration:"
echo "  Total samples: $TOTAL_SAMPLES"
echo "  Effective batch size: $EFFECTIVE_BATCH_SIZE"
echo "  Steps per epoch: $STEPS_PER_EPOCH"
echo "  Total training steps: $MAX_STEPS"
echo "  Training epochs: $MAX_EPOCHS"
```

### Step 9: Start DPO Training

```bash
# Start training with all parameters
python train.py config/train_smollm3_dpo_6epochs.py \
    --dataset_dir smoltalk_dataset \
    --out_dir /output-checkpoint \
    --init_from scratch \
    --max_iters $MAX_STEPS \
    --batch_size $BATCH_SIZE \
    --learning_rate 5e-6 \
    --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
    --max_seq_length 4096 \
    --save_steps 500 \
    --eval_steps 100 \
    --logging_steps 10 \
    --enable_tracking \
    --trackio_url "https://your-trackio-space.hf.space" \
    --experiment_name "smollm3_dpo_6epochs"
```

### Step 10: Push Model to Hugging Face Hub

```bash
# Push the trained model
python push_to_huggingface.py /output-checkpoint "your-username/smollm3-dpo-6epochs" \
    --token "$HF_TOKEN" \
    --trackio-url "https://your-trackio-space.hf.space" \
    --experiment-name "smollm3_dpo_6epochs"
```

### Step 11: Test the Uploaded Model

```bash
# Test the model
python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print('Loading uploaded model...')
model = AutoModelForCausalLM.from_pretrained('your-username/smollm3-dpo-6epochs', torch_dtype=torch.float16, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained('your-username/smollm3-dpo-6epochs')

print('Testing model generation...')
prompt = 'Hello, how are you?'
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f'Prompt: {prompt}')
print(f'Response: {response}')
print('✅ Model test completed successfully!')
"
```

## Complete One-Line Deployment

If you want to run everything automatically, use the deployment script:

```bash
# Make script executable
chmod +x cloud_deployment.sh

# Edit configuration in the script first
nano cloud_deployment.sh
# Change these variables:
# - REPO_NAME="your-username/smollm3-dpo-6epochs"
# - TRACKIO_URL="https://your-trackio-space.hf.space"
# - HF_TOKEN="your_hf_token_here"

# Run the complete deployment
./cloud_deployment.sh
```

## Monitoring and Debugging

### Check GPU Usage

```bash
# Monitor GPU usage during training
watch -n 1 nvidia-smi
```

### Check Training Logs

```bash
# Monitor training progress
tail -f training.log

# Check system resources
htop
```

### Monitor Trackio

```bash
# Check if Trackio is logging properly
curl -s "https://your-trackio-space.hf.space" | grep -i "experiment"
```

## Expected Timeline

- **Setup**: 15-30 minutes
- **Dataset preparation**: 5-10 minutes
- **Training (6 epochs)**: 4-8 hours (depending on GPU)
- **Model upload**: 10-30 minutes
- **Testing**: 5-10 minutes

## Troubleshooting

### Common Issues

#### 1. Out of Memory (OOM)
```bash
# Reduce batch size
BATCH_SIZE=1
GRADIENT_ACCUMULATION_STEPS=16

# Or use gradient checkpointing
# Already enabled in config
```

#### 2. Slow Training
```bash
# Check GPU utilization
nvidia-smi

# Check if mixed precision is working
# Look for "fp16" in training logs
```

#### 3. Dataset Issues
```bash
# Check dataset format
head -n 5 smoltalk_dataset/train.json

# Verify dataset size
wc -l smoltalk_dataset/train.json
```

#### 4. Authentication Issues
```bash
# Test HF token
python -c "
from huggingface_hub import HfApi
api = HfApi(token='$HF_TOKEN')
print('Token is valid!')
"
```

## Cost Estimation

### AWS (g5.2xlarge)
- **Instance**: $0.526/hour
- **Training time**: 6 hours
- **Total cost**: ~$3.16

### Google Cloud (n1-standard-8 + T4)
- **Instance**: $0.38/hour
- **Training time**: 6 hours
- **Total cost**: ~$2.28

### Azure (Standard_NC6s_v3)
- **Instance**: $0.90/hour
- **Training time**: 6 hours
- **Total cost**: ~$5.40

## Next Steps

After successful deployment:

1. **Monitor training** in your Trackio Space
2. **Check model repository** on Hugging Face Hub
3. **Test the model** with different prompts
4. **Share your model** with the community
5. **Iterate and improve** based on results

## Support

- **Training issues**: Check logs and GPU utilization
- **Upload issues**: Verify HF token and repository permissions
- **Monitoring issues**: Check Trackio Space configuration
- **Performance issues**: Adjust batch size and learning rate

Your SmolLM3 DPO model will be ready for use after training completes!
    	
CLOUD_TRAINING_GUIDE.md ADDED
@@ -0,0 +1,440 @@
# Cloud Training Guide for OpenHermes-FR Dataset

This guide provides step-by-step instructions for training SmolLM3 models on cloud instances using the [legmlai/openhermes-fr](https://huggingface.co/datasets/legmlai/openhermes-fr) dataset.

## Overview

The OpenHermes-FR dataset contains 799,875 French instruction-response pairs, well suited for fine-tuning SmolLM3 models on French language tasks. This guide covers:

- ✅ **Cloud Instance Setup** - Complete environment configuration
- ✅ **Dataset Integration** - Automatic loading and filtering
- ✅ **Training Configuration** - Optimized for French instruction tuning
- ✅ **Monitoring Integration** - Trackio experiment tracking
- ✅ **Model Deployment** - Push to Hugging Face Hub

## Dataset Information

### Schema
```json
{
  "prompt": "Explique la différence entre la photosynthèse C3 et C4.",
  "accepted_completion": "La photosynthèse C3 utilise… (réponse détaillée)",
  "bad_prompt_detected": false,
  "bad_response_detected": false,
  "bad_entry": false
}
```

### Key Features
- **Size**: 799,875 examples (~1.4GB)
- **Language**: 100% French
- **Quality**: GPT-4o generated responses with automatic filtering
- **License**: ODC-BY 1.0

| 34 | 
            +
            ## Cloud Instance Setup
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            ### 1. Choose Your Cloud Provider
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            #### **AWS EC2 (Recommended)**
         | 
| 39 | 
            +
            ```bash
         | 
| 40 | 
            +
            # Launch instance with GPU
         | 
| 41 | 
            +
            # Recommended: g4dn.xlarge or g5.xlarge
         | 
| 42 | 
            +
            # AMI: Deep Learning AMI (Ubuntu 20.04)
         | 
| 43 | 
            +
            ```
         | 
| 44 | 
            +
             | 
| 45 | 
            +
            #### **Google Cloud Platform**
         | 
| 46 | 
            +
            ```bash
         | 
| 47 | 
            +
            # Launch instance with GPU
         | 
| 48 | 
            +
            # Recommended: n1-standard-4 with Tesla T4 or V100
         | 
| 49 | 
            +
            ```
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            #### **Azure**
         | 
| 52 | 
            +
            ```bash
         | 
| 53 | 
            +
            # Launch instance with GPU
         | 
| 54 | 
            +
            # Recommended: Standard_NC6s_v3 or Standard_NC12s_v3
         | 
| 55 | 
            +
            ```
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            ### 2. Instance Specifications
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            #### **Minimum Requirements**
         | 
| 60 | 
            +
            - **GPU**: 16GB+ VRAM (Tesla T4, V100, or A100)
         | 
| 61 | 
            +
            - **RAM**: 32GB+ system memory
         | 
| 62 | 
            +
            - **Storage**: 100GB+ SSD
         | 
| 63 | 
            +
            - **CPU**: 8+ cores
         | 
| 64 | 
            +
             | 
| 65 | 
            +
            #### **Recommended Specifications**
         | 
| 66 | 
            +
            - **GPU**: A100 (40GB) or H100 (80GB)
         | 
| 67 | 
            +
            - **RAM**: 64GB+ system memory
         | 
| 68 | 
            +
            - **Storage**: 200GB+ NVMe SSD
         | 
| 69 | 
            +
            - **CPU**: 16+ cores
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            ### 3. Environment Setup
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            ```bash
         | 
| 74 | 
            +
            # Update system
         | 
| 75 | 
            +
            sudo apt update && sudo apt upgrade -y
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            # Install CUDA (if not pre-installed)
         | 
| 78 | 
            +
            # Follow NVIDIA CUDA installation guide for your GPU
         | 
| 79 | 
            +
             | 
| 80 | 
            +
            # Install Python dependencies
         | 
| 81 | 
            +
            sudo apt install python3-pip python3-venv git -y
         | 
| 82 | 
            +
             | 
| 83 | 
            +
            # Create virtual environment
         | 
| 84 | 
            +
            python3 -m venv smollm3_env
         | 
| 85 | 
            +
            source smollm3_env/bin/activate
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            # Clone repository
         | 
| 88 | 
            +
            git clone <your-repo-url>
         | 
| 89 | 
            +
            cd <your-repo-directory>
         | 
| 90 | 
            +
             | 
| 91 | 
            +
            # Install dependencies
         | 
| 92 | 
            +
            pip install -r requirements.txt
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            # Install additional dependencies for cloud training
         | 
| 95 | 
            +
            pip install accelerate transformers datasets huggingface_hub
         | 
| 96 | 
            +
            ```
         | 
| 97 | 
            +
             | 
| 98 | 
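Before launching a run, it is worth confirming that the GPU is actually visible to PyTorch. The check below is a generic sketch (not part of the repository's scripts) and only assumes that `torch` was installed via `requirements.txt`:

```python
import torch

# Confirm that CUDA is available and report which GPU will be used
if torch.cuda.is_available():
    name = torch.cuda.get_device_name(0)
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU detected: {name} ({vram_gb:.1f} GB VRAM)")
else:
    print("No CUDA device detected - training would fall back to CPU")
```
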
            +
            ## Training Configuration
         | 
| 99 | 
            +
             | 
| 100 | 
            +
            ### 1. Use the OpenHermes-FR Config
         | 
| 101 | 
            +
             | 
| 102 | 
            +
            The repository includes a specialized configuration for the OpenHermes-FR dataset:
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            ```bash
         | 
| 105 | 
            +
            python train.py config/train_smollm3_openhermes_fr.py \
         | 
| 106 | 
            +
                --enable_tracking \
         | 
| 107 | 
            +
                --trackio_url "https://your-space.hf.space" \
         | 
| 108 | 
            +
                --experiment_name "smollm3_fr_openhermes_v1"
         | 
| 109 | 
            +
            ```
         | 
| 110 | 
            +
             | 
| 111 | 
            +
            ### 2. Configuration Details
         | 
| 112 | 
            +
             | 
| 113 | 
            +
            The `config/train_smollm3_openhermes_fr.py` includes:
         | 
| 114 | 
            +
             | 
| 115 | 
            +
            #### **Dataset Configuration**
         | 
| 116 | 
            +
            ```python
         | 
| 117 | 
            +
            dataset_name: str = "legmlai/openhermes-fr"
         | 
| 118 | 
            +
            dataset_split: str = "train"
         | 
| 119 | 
            +
            input_field: str = "prompt"
         | 
| 120 | 
            +
            target_field: str = "accepted_completion"
         | 
| 121 | 
            +
            filter_bad_entries: bool = True
         | 
| 122 | 
            +
            bad_entry_field: str = "bad_entry"
         | 
| 123 | 
            +
            ```
         | 
| 124 | 
            +
             | 
| 125 | 
            +
            #### **Training Optimization**
         | 
| 126 | 
            +
            ```python
         | 
| 127 | 
            +
            batch_size: int = 2  # Reduced for French text (longer sequences)
         | 
| 128 | 
            +
            gradient_accumulation_steps: int = 8  # Maintains effective batch size
         | 
| 129 | 
            +
            learning_rate: float = 1e-5  # Lower for instruction tuning
         | 
| 130 | 
            +
            max_iters: int = 2000  # More iterations for large dataset
         | 
| 131 | 
            +
            ```
         | 
| 132 | 
            +
             | 
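With these defaults, the effective batch size is the per-device batch size multiplied by the gradient accumulation steps (and by the number of GPUs when training on more than one). A quick sanity check using the values above:

```python
# Effective batch size for the configuration above (single GPU assumed)
batch_size = 2
gradient_accumulation_steps = 8
num_gpus = 1  # adjust if you launch on multiple GPUs

effective_batch_size = batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch_size)  # 16
```
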
#### **Monitoring Integration**
```python
enable_tracking: bool = True
experiment_name: str = "smollm3_openhermes_fr"
```

## Training Commands

### Basic Training
```bash
python train.py config/train_smollm3_openhermes_fr.py
```

### Training with Monitoring
```bash
python train.py config/train_smollm3_openhermes_fr.py \
    --enable_tracking \
    --trackio_url "https://your-trackio-space.hf.space" \
    --experiment_name "smollm3_fr_openhermes_v1"
```

### Training with Custom Parameters
```bash
python train.py config/train_smollm3_openhermes_fr.py \
    --batch_size 4 \
    --learning_rate 2e-5 \
    --max_iters 3000 \
    --enable_tracking \
    --trackio_url "https://your-trackio-space.hf.space" \
    --experiment_name "smollm3_fr_high_lr"
```

### Training with Checkpoint Resume
```bash
python train.py config/train_smollm3_openhermes_fr.py \
    --init_from resume \
    --enable_tracking \
    --trackio_url "https://your-trackio-space.hf.space" \
    --experiment_name "smollm3_fr_resume"
```

## Dataset Processing

### Automatic Filtering

The training script automatically:
- ✅ **Loads** the OpenHermes-FR dataset from Hugging Face
- ✅ **Filters** out bad entries (`bad_entry = true`)
- ✅ **Splits** data into train/validation/test (98/1/1)
- ✅ **Formats** prompts and completions for instruction tuning

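A minimal sketch of the filtering and splitting steps listed above, written against the Hugging Face `datasets` API; the exact implementation in the repository's data loading code may differ:

```python
from datasets import load_dataset

# Load the dataset and drop entries flagged as bad
dataset = load_dataset("legmlai/openhermes-fr", split="train")
dataset = dataset.filter(lambda example: not example["bad_entry"])

# Roughly reproduce the 98/1/1 train/validation/test split
split = dataset.train_test_split(test_size=0.02, seed=42)
held_out = split["test"].train_test_split(test_size=0.5, seed=42)

train_ds, val_ds, test_ds = split["train"], held_out["train"], held_out["test"]
print(len(train_ds), len(val_ds), len(test_ds))
```
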
            +
            ### Manual Dataset Inspection
         | 
| 185 | 
            +
             | 
| 186 | 
            +
            ```python
         | 
| 187 | 
            +
            from datasets import load_dataset
         | 
| 188 | 
            +
             | 
| 189 | 
            +
            # Load dataset
         | 
| 190 | 
            +
            dataset = load_dataset("legmlai/openhermes-fr")
         | 
| 191 | 
            +
             | 
| 192 | 
            +
            # Check dataset info
         | 
| 193 | 
            +
            print(f"Dataset size: {len(dataset['train'])}")
         | 
| 194 | 
            +
            print(f"Sample columns: {dataset['train'].column_names}")
         | 
| 195 | 
            +
             | 
| 196 | 
            +
            # Check filtering
         | 
| 197 | 
            +
            bad_entries = dataset['train'].filter(lambda x: x['bad_entry'])
         | 
| 198 | 
            +
            print(f"Bad entries: {len(bad_entries)}")
         | 
| 199 | 
            +
             | 
| 200 | 
            +
            # Sample data
         | 
| 201 | 
            +
            sample = dataset['train'][0]
         | 
| 202 | 
            +
            print(f"Prompt: {sample['prompt']}")
         | 
| 203 | 
            +
            print(f"Completion: {sample['accepted_completion']}")
         | 
| 204 | 
            +
            ```
         | 
| 205 | 
            +
             | 
| 206 | 
            +
            ## Monitoring and Tracking
         | 
| 207 | 
            +
             | 
| 208 | 
            +
            ### Trackio Integration
         | 
| 209 | 
            +
             | 
| 210 | 
            +
            The training automatically logs:
         | 
| 211 | 
            +
            - **Training metrics**: Loss, accuracy, learning rate
         | 
| 212 | 
            +
            - **System metrics**: GPU memory, CPU usage
         | 
| 213 | 
            +
            - **Dataset info**: Size, filtering statistics
         | 
| 214 | 
            +
            - **Model checkpoints**: Regular saves with metadata
         | 
| 215 | 
            +
             | 
| 216 | 
            +
            ### View Training Progress
         | 
| 217 | 
            +
             | 
| 218 | 
            +
            1. **Trackio Space**: Visit your Trackio Space URL
         | 
| 219 | 
            +
            2. **Experiment Details**: Check the "View Experiments" tab
         | 
| 220 | 
            +
            3. **Metrics**: Monitor loss curves and system usage
         | 
| 221 | 
            +
            4. **Logs**: Download training logs for analysis
         | 
| 222 | 
            +
             | 
| 223 | 
            +
            ## Model Deployment
         | 
| 224 | 
            +
             | 
| 225 | 
            +
            ### Push to Hugging Face Hub
         | 
| 226 | 
            +
             | 
| 227 | 
            +
            After training, deploy your model:
         | 
| 228 | 
            +
             | 
| 229 | 
            +
            ```bash
         | 
| 230 | 
            +
            python push_to_huggingface.py /output-checkpoint username/smollm3-fr-openhermes \
         | 
| 231 | 
            +
                --trackio-url "https://your-trackio-space.hf.space" \
         | 
| 232 | 
            +
                --experiment-name "smollm3_fr_openhermes_v1"
         | 
| 233 | 
            +
            ```
         | 
| 234 | 
            +
             | 
| 235 | 
            +
            ### Use Your Model
         | 
| 236 | 
            +
             | 
| 237 | 
            +
            ```python
         | 
| 238 | 
            +
            from transformers import AutoModelForCausalLM, AutoTokenizer
         | 
| 239 | 
            +
             | 
| 240 | 
            +
            # Load your fine-tuned model
         | 
| 241 | 
            +
            model = AutoModelForCausalLM.from_pretrained("username/smollm3-fr-openhermes")
         | 
| 242 | 
            +
            tokenizer = AutoTokenizer.from_pretrained("username/smollm3-fr-openhermes")
         | 
| 243 | 
            +
             | 
| 244 | 
            +
            # Generate French text
         | 
| 245 | 
            +
            prompt = "Expliquez le concept de l'intelligence artificielle."
         | 
| 246 | 
            +
            inputs = tokenizer(prompt, return_tensors="pt")
         | 
| 247 | 
            +
            outputs = model.generate(**inputs, max_new_tokens=200)
         | 
| 248 | 
            +
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))
         | 
| 249 | 
            +
            ```
         | 
| 250 | 
            +
             | 
| 251 | 
            +
            ## Performance Optimization
         | 
| 252 | 
            +
             | 
| 253 | 
            +
            ### GPU Memory Management
         | 
| 254 | 
            +
             | 
| 255 | 
            +
            ```bash
         | 
| 256 | 
            +
            # Monitor GPU usage
         | 
| 257 | 
            +
            nvidia-smi -l 1
         | 
| 258 | 
            +
             | 
| 259 | 
            +
            # Optimize for your GPU
         | 
| 260 | 
            +
            # For 16GB VRAM: batch_size=2, gradient_accumulation_steps=8
         | 
| 261 | 
            +
            # For 24GB VRAM: batch_size=4, gradient_accumulation_steps=4
         | 
| 262 | 
            +
            # For 40GB+ VRAM: batch_size=8, gradient_accumulation_steps=2
         | 
| 263 | 
            +
            ```
         | 
| 264 | 
            +
             | 
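If you prefer to pick these values programmatically, a small helper like the one below (generic PyTorch, not part of the repository) can read the available VRAM and suggest a starting point from the table above:

```python
import torch

def suggest_batch_settings():
    """Suggest (batch_size, gradient_accumulation_steps) from available VRAM."""
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    if vram_gb >= 40:
        return 8, 2
    if vram_gb >= 24:
        return 4, 4
    return 2, 8

batch_size, grad_accum = suggest_batch_settings()
print(f"batch_size={batch_size}, gradient_accumulation_steps={grad_accum}")
```
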
### Training Speed

```python
# Use mixed precision (enabled by default)
fp16: bool = True

# Enable gradient checkpointing (enabled by default)
use_gradient_checkpointing: bool = True

# Use flash attention (enabled by default)
use_flash_attention: bool = True
```

## Troubleshooting

### Common Issues

#### 1. **Out of Memory (OOM)**
```bash
# Reduce batch size
python train.py config/train_smollm3_openhermes_fr.py --batch_size 1

# Increase gradient accumulation
# Edit config: gradient_accumulation_steps = 16
```

#### 2. **Slow Training**
```bash
# Check GPU utilization
nvidia-smi

# Verify data loading
# Check if dataset is cached locally
```

#### 3. **Dataset Loading Issues**
```bash
# Clear cache
rm -rf ~/.cache/huggingface/

# Check internet connection
# Verify dataset name: "legmlai/openhermes-fr"
```

#### 4. **Monitoring Connection Issues**
```bash
# Test Trackio connection
curl -I https://your-trackio-space.hf.space

# Check token permissions
# Verify experiment name format
```

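The same reachability check can be scripted. This is a generic sketch using the `requests` library (the Space URL is a placeholder), not a call into the repository's monitoring code:

```python
import requests

# Quick reachability check for the Trackio Space
url = "https://your-trackio-space.hf.space"
response = requests.head(url, timeout=10)
print(f"{url} -> HTTP {response.status_code}")
```
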
### Debug Mode

```bash
# Enable debug logging
export LOG_LEVEL=DEBUG
python train.py config/train_smollm3_openhermes_fr.py
```

## Cost Optimization

### Cloud Provider Tips

#### **AWS EC2**
- Use Spot Instances for cost savings
- Monitor usage with CloudWatch
- Use appropriate instance types

#### **Google Cloud Platform**
- Use Preemptible VMs for non-critical training
- Monitor with Cloud Monitoring
- Use committed use discounts

#### **Azure**
- Use Spot VMs for cost optimization
- Monitor with Azure Monitor
- Use reserved instances for long training runs

### Training Time Estimates

| GPU Type | Batch Size | Estimated Time |
|----------|------------|----------------|
| Tesla T4 (16GB) | 2 | 8-12 hours |
| V100 (32GB) | 4 | 4-6 hours |
| A100 (40GB) | 8 | 2-3 hours |
| H100 (80GB) | 16 | 1-2 hours |

## Security Best Practices

### Token Management
```bash
# Use environment variables
export HF_TOKEN="your_token_here"
export TRACKIO_TOKEN="your_trackio_token"

# Don't hardcode tokens in scripts
# Use IAM roles when possible
```

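On the Python side, scripts can then read these tokens from the environment instead of hardcoding them; a minimal sketch using the variable names exported above:

```python
import os

# Read tokens from the environment instead of hardcoding them in scripts
hf_token = os.environ.get("HF_TOKEN")
trackio_token = os.environ.get("TRACKIO_TOKEN")

if hf_token is None:
    raise RuntimeError("HF_TOKEN is not set - export it before pushing to the Hub")
```
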
### Data Privacy
```bash
# Use private repositories for sensitive models
python push_to_huggingface.py model username/private-model --private

# Secure your cloud instance
# Use VPC and security groups
```

## Complete Workflow Example

### 1. Set Up Cloud Instance
```bash
# Launch GPU instance
# Install dependencies
git clone <your-repo>
cd <your-repo>
pip install -r requirements.txt
```

### 2. Train Model
```bash
python train.py config/train_smollm3_openhermes_fr.py \
    --enable_tracking \
    --trackio_url "https://your-space.hf.space" \
    --experiment_name "smollm3_fr_v1"
```

### 3. Deploy Model
```bash
python push_to_huggingface.py /output-checkpoint username/smollm3-fr-v1 \
    --trackio-url "https://your-space.hf.space" \
    --experiment-name "smollm3_fr_v1"
```

### 4. Test Model
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("username/smollm3-fr-v1")
tokenizer = AutoTokenizer.from_pretrained("username/smollm3-fr-v1")

# Test French generation
prompt = "Qu'est-ce que l'apprentissage automatique?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Support and Resources

### Documentation
- [OpenHermes-FR Dataset](https://huggingface.co/datasets/legmlai/openhermes-fr)
- [SmolLM3 Model](https://huggingface.co/HuggingFaceTB/SmolLM3-3B)
- [Trackio Monitoring](https://github.com/Josephrp/trackio)

### Community
- [Hugging Face Forums](https://discuss.huggingface.co/)
- [Transformers Documentation](https://huggingface.co/docs/transformers/)

### Examples
- [French Language Models](https://huggingface.co/models?search=french)
- [Instruction Tuned Models](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads)

## Conclusion

This guide provides everything needed to train SmolLM3 models on the OpenHermes-FR dataset in the cloud:

- ✅ **Complete Setup** - From cloud instance to model deployment
- ✅ **Optimized Configuration** - Tailored for French instruction tuning
- ✅ **Monitoring Integration** - Trackio experiment tracking
- ✅ **Cost Optimization** - Tips for efficient cloud usage
- ✅ **Troubleshooting** - Solutions for common issues

Start training your French language model today!
    	
DEPLOYMENT_GUIDE.md (ADDED, +397 lines)

# Trackio Deployment Guide for Hugging Face Spaces

This guide provides step-by-step instructions for deploying Trackio experiment tracking to Hugging Face Spaces and integrating it with your SmolLM3 fine-tuning pipeline.

## Prerequisites

- Hugging Face account
- Hugging Face CLI installed (`pip install huggingface_hub`)
- Git configured with your Hugging Face credentials

## Method 1: Automated Deployment (Recommended)

### Step 1: Run the Deployment Script

```bash
python deploy_trackio_space.py
```

The script will prompt you for:
- Your Hugging Face username
- Space name (e.g., `trackio-monitoring`)
- Hugging Face token (a write token is required)

### Step 2: Wait for Build

After deployment, wait 2-5 minutes for the Space to build and become available.

### Step 3: Test the Interface

Visit your Space URL to test the interface:
```
https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
```

## Method 2: Manual Deployment

### Step 1: Create a New Space

1. Go to https://huggingface.co/spaces
2. Click "Create new Space"
3. Configure the Space:
   - **Owner**: Your username
   - **Space name**: `trackio-monitoring` (or your preferred name)
   - **SDK**: Gradio
   - **Hardware**: CPU (Basic)
   - **License**: MIT

### Step 2: Upload Files

Upload these files to your Space:

#### `app.py`
The main Gradio interface (already created in this repository)

#### `requirements_space.txt`
```
gradio>=4.0.0
gradio-client>=0.10.0
requests>=2.31.0
numpy>=1.24.0
pandas>=2.0.0
jsonschema>=4.17.0
plotly>=5.15.0
matplotlib>=3.7.0
python-dotenv>=1.0.0
```

#### `README.md`
````markdown
# Trackio Experiment Tracking

A Gradio interface for experiment tracking and monitoring.

## Features

- Create and manage experiments
- Log training metrics and parameters
- View experiment details and results
- Update experiment status

## Usage

1. Create a new experiment using the "Create Experiment" tab
2. Log metrics during training using the "Log Metrics" tab
3. View experiment details using the "View Experiments" tab
4. Update experiment status using the "Update Status" tab

## Integration

To connect your training script to this Trackio Space:

```python
from monitoring import SmolLM3Monitor

monitor = SmolLM3Monitor(
    experiment_name="my_experiment",
    trackio_url="https://your-space.hf.space",
    enable_tracking=True
)
```
````

### Step 3: Configure Space Settings

In your Space settings, ensure:
- **App file**: `app.py`
- **Python version**: 3.9 or higher
- **Hardware**: CPU (Basic) is sufficient

## Integration with Your Training Script

### Step 1: Update Your Configuration

Add Trackio settings to your training configuration:

```python
# config/train_smollm3.py
@dataclass
class SmolLM3Config:
    # ... existing settings ...

    # Trackio monitoring configuration
    enable_tracking: bool = True
    trackio_url: Optional[str] = None  # Your Space URL
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None
```

### Step 2: Run Training with Trackio

```bash
python train.py config/train_smollm3.py \
    --dataset_dir my_dataset \
    --enable_tracking \
    --trackio_url "https://your-username-trackio-monitoring.hf.space" \
    --experiment_name "smollm3_finetune_v1"
```

### Step 3: Monitor Your Experiments

1. **Create Experiment**: Use the "Create Experiment" tab in your Space
2. **Log Metrics**: Your training script will automatically log metrics
3. **View Results**: Use the "View Experiments" tab to see progress
4. **Update Status**: Mark experiments as completed when done

## Advanced Configuration

### Environment Variables

You can set Trackio configuration via environment variables:

```bash
export TRACKIO_URL="https://your-space.hf.space"
export TRACKIO_TOKEN="your_token_here"
```

### Custom Experiment Names

```bash
python train.py config/train_smollm3.py \
    --experiment_name "smollm3_high_lr_experiment" \
    --trackio_url "https://your-space.hf.space"
```

### Multiple Experiments

You can run multiple experiments and track them separately:

```bash
# Experiment 1
python train.py config/train_smollm3.py \
    --experiment_name "smollm3_baseline" \
    --learning_rate 2e-5

# Experiment 2
python train.py config/train_smollm3.py \
    --experiment_name "smollm3_high_lr" \
    --learning_rate 5e-5
```

## Using the Trackio Interface

### Creating Experiments

1. Go to the "Create Experiment" tab
2. Enter an experiment name (e.g., "smollm3_finetune_v1")
3. Add a description (optional)
4. Click "Create Experiment"
5. Note the experiment ID for logging metrics

### Logging Metrics

1. Go to the "Log Metrics" tab
2. Enter your experiment ID
3. Add metrics in JSON format:
   ```json
   {
     "loss": 0.5,
     "accuracy": 0.85,
     "learning_rate": 2e-5
   }
   ```
4. Add the step number (optional)
5. Click "Log Metrics" (a programmatic equivalent is sketched after this list)

            ### Viewing Experiments
         | 
| 209 | 
            +
             | 
| 210 | 
            +
            1. Go to the "View Experiments" tab
         | 
| 211 | 
            +
            2. Enter experiment ID to view specific experiment
         | 
| 212 | 
            +
            3. Or click "List All Experiments" to see all experiments
         | 
| 213 | 
            +
             | 
| 214 | 
            +
            ### Updating Status
         | 
| 215 | 
            +
             | 
| 216 | 
            +
            1. Go to the "Update Status" tab
         | 
| 217 | 
            +
            2. Enter experiment ID
         | 
| 218 | 
            +
            3. Select new status (running, completed, failed, paused)
         | 
| 219 | 
            +
            4. Click "Update Status"
         | 
| 220 | 
            +
             | 
| 221 | 
            +
            ## Troubleshooting
         | 
| 222 | 
            +
             | 
| 223 | 
            +
            ### Common Issues
         | 
| 224 | 
            +
             | 
| 225 | 
            +
            #### 1. Space Not Building
         | 
| 226 | 
            +
            - Check that all required files are uploaded
         | 
| 227 | 
            +
            - Verify `app.py` is the main file
         | 
| 228 | 
            +
            - Check the Space logs for errors
         | 
| 229 | 
            +
             | 
| 230 | 
            +
            #### 2. Connection Errors
         | 
| 231 | 
            +
            - Verify your Space URL is correct
         | 
| 232 | 
            +
            - Check that the Space is running (not paused)
         | 
| 233 | 
            +
            - Ensure your training script can reach the Space URL
         | 
| 234 | 
            +
             | 
| 235 | 
            +
            #### 3. Missing Metrics
         | 
| 236 | 
            +
            - Check that `enable_tracking=True` in your config
         | 
| 237 | 
            +
            - Verify the Trackio URL is correct
         | 
| 238 | 
            +
            - Check training logs for monitoring errors
         | 
| 239 | 
            +
             | 
| 240 | 
            +
            #### 4. Authentication Issues
         | 
| 241 | 
            +
            - If using tokens, verify they're correct
         | 
| 242 | 
            +
            - Check Hugging Face account permissions
         | 
| 243 | 
            +
            - Ensure Space is public or you have access
         | 
| 244 | 
            +
             | 
| 245 | 
            +
            ### Debug Mode
         | 
| 246 | 
            +
             | 
| 247 | 
            +
            Enable debug logging in your training script:
         | 
| 248 | 
            +
             | 
| 249 | 
            +
            ```python
         | 
| 250 | 
            +
            import logging
         | 
| 251 | 
            +
            logging.basicConfig(level=logging.DEBUG)
         | 
| 252 | 
            +
            ```
         | 
| 253 | 
            +
             | 
| 254 | 
            +
            ### Manual Testing
         | 
| 255 | 
            +
             | 
| 256 | 
            +
            Test the Trackio interface manually:
         | 
| 257 | 
            +
             | 
| 258 | 
            +
            1. Create an experiment
         | 
| 259 | 
            +
            2. Log some test metrics
         | 
| 260 | 
            +
            3. View the experiment details
         | 
| 261 | 
            +
            4. Update the status
         | 
| 262 | 
            +
             | 
| 263 | 
            +
            ## Security Considerations
         | 
| 264 | 
            +
             | 
| 265 | 
            +
            ### Public vs Private Spaces
         | 
| 266 | 
            +
             | 
| 267 | 
            +
            - **Public Spaces**: Anyone can view and use the interface
         | 
| 268 | 
            +
            - **Private Spaces**: Only you and collaborators can access
         | 
| 269 | 
            +
             | 
| 270 | 
            +
            ### Token Management
         | 
| 271 | 
            +
             | 
| 272 | 
            +
            - Store tokens securely (environment variables)
         | 
| 273 | 
            +
            - Don't commit tokens to version control
         | 
| 274 | 
            +
            - Use Hugging Face's token management
         | 
| 275 | 
            +
             | 
| 276 | 
            +
            ### Data Privacy
         | 
| 277 | 
            +
             | 
| 278 | 
            +
            - Trackio stores experiment data in the Space
         | 
| 279 | 
            +
            - Consider data retention policies
         | 
| 280 | 
            +
            - Be mindful of sensitive information in experiment names
         | 
| 281 | 
            +
             | 
| 282 | 
            +
            ## Performance Optimization
         | 
| 283 | 
            +
             | 
| 284 | 
            +
            ### Space Configuration
         | 
| 285 | 
            +
             | 
| 286 | 
            +
            - Use CPU (Basic) for the interface (sufficient for tracking)
         | 
| 287 | 
            +
            - Consider GPU only for actual training
         | 
| 288 | 
            +
            - Monitor Space usage and limits
         | 
| 289 | 
            +
             | 
| 290 | 
            +
            ### Efficient Logging
         | 
| 291 | 
            +
             | 
| 292 | 
            +
            - Log metrics at reasonable intervals (every 10-100 steps)
         | 
| 293 | 
            +
            - Avoid logging too frequently to prevent rate limiting
         | 
| 294 | 
            +
            - Use batch logging when possible
         | 
| 295 | 
            +
             | 
| 296 | 
            +
            ## Monitoring Best Practices
         | 
| 297 | 
            +
             | 
| 298 | 
            +
            ### Experiment Naming
         | 
| 299 | 
            +
             | 
| 300 | 
            +
            Use descriptive names:
         | 
| 301 | 
            +
            - `smollm3_baseline_v1`
         | 
| 302 | 
            +
            - `smollm3_high_lr_experiment`
         | 
| 303 | 
            +
            - `smollm3_dpo_training`
         | 
| 304 | 
            +
             | 
| 305 | 
            +
            ### Metric Logging
         | 
| 306 | 
            +
             | 
| 307 | 
            +
            Log relevant metrics:
         | 
| 308 | 
            +
            - Training loss
         | 
| 309 | 
            +
            - Validation loss
         | 
| 310 | 
            +
            - Learning rate
         | 
| 311 | 
            +
            - GPU memory usage
         | 
| 312 | 
            +
            - Training time
         | 
| 313 | 
            +
             | 
| 314 | 
            +
            ### Status Management
         | 
| 315 | 
            +
             | 
| 316 | 
            +
            - Mark experiments as "running" when starting
         | 
| 317 | 
            +
            - Update to "completed" when finished
         | 
| 318 | 
            +
            - Mark as "failed" if errors occur
         | 
| 319 | 
            +
            - Use "paused" for temporary stops
         | 
| 320 | 
            +
             | 
| 321 | 
            +
            ## Integration Examples
         | 
| 322 | 
            +
             | 
| 323 | 
            +
            ### Basic Integration
         | 
| 324 | 
            +
             | 
| 325 | 
            +
            ```python
         | 
| 326 | 
            +
            from monitoring import SmolLM3Monitor
         | 
| 327 | 
            +
             | 
| 328 | 
            +
            # Initialize monitor
         | 
| 329 | 
            +
            monitor = SmolLM3Monitor(
         | 
| 330 | 
            +
                experiment_name="my_experiment",
         | 
| 331 | 
            +
                trackio_url="https://your-space.hf.space",
         | 
| 332 | 
            +
                enable_tracking=True
         | 
| 333 | 
            +
            )
         | 
| 334 | 
            +
             | 
| 335 | 
            +
            # Log configuration
         | 
| 336 | 
            +
            monitor.log_config(config_dict)
         | 
| 337 | 
            +
             | 
| 338 | 
            +
            # Log metrics during training
         | 
| 339 | 
            +
            monitor.log_metrics({"loss": 0.5}, step=100)
         | 
| 340 | 
            +
             | 
| 341 | 
            +
            # Log final results
         | 
| 342 | 
            +
            monitor.log_training_summary(final_results)
         | 
| 343 | 
            +
            ```
         | 
| 344 | 
            +
             | 
| 345 | 
            +
            ### Advanced Integration
         | 
| 346 | 
            +
             | 
| 347 | 
            +
            ```python
         | 
| 348 | 
            +
            # Custom monitoring setup
         | 
| 349 | 
            +
            monitor = SmolLM3Monitor(
         | 
| 350 | 
            +
                experiment_name="smollm3_advanced",
         | 
| 351 | 
            +
                trackio_url="https://your-space.hf.space",
         | 
| 352 | 
            +
                enable_tracking=True,
         | 
| 353 | 
            +
                log_artifacts=True,
         | 
| 354 | 
            +
                log_metrics=True,
         | 
| 355 | 
            +
                log_config=True
         | 
| 356 | 
            +
            )
         | 
| 357 | 
            +
             | 
| 358 | 
            +
            # Log system metrics
         | 
| 359 | 
            +
            monitor.log_system_metrics(step=current_step)
         | 
| 360 | 
            +
             | 
| 361 | 
            +
            # Log model checkpoint
         | 
| 362 | 
            +
            monitor.log_model_checkpoint("checkpoint-1000", step=1000)
         | 
| 363 | 
            +
             | 
| 364 | 
            +
            # Log evaluation results
         | 
| 365 | 
            +
            monitor.log_evaluation_results(eval_results, step=1000)
         | 
| 366 | 
            +
            ```
         | 
| 367 | 
            +
             | 
| 368 | 
            +
            ## Support and Resources
         | 
| 369 | 
            +
             | 
| 370 | 
            +
            ### Documentation
         | 
| 371 | 
            +
             | 
| 372 | 
            +
            - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)
         | 
| 373 | 
            +
            - [Gradio Documentation](https://gradio.app/docs/)
         | 
| 374 | 
            +
            - [Trackio GitHub Repository](https://github.com/Josephrp/trackio)
         | 
| 375 | 
            +
             | 
| 376 | 
            +
            ### Community
         | 
| 377 | 
            +
             | 
| 378 | 
            +
            - [Hugging Face Forums](https://discuss.huggingface.co/)
         | 
| 379 | 
            +
            - [Gradio Discord](https://discord.gg/feTf9z3Z)
         | 
| 380 | 
            +
             | 
| 381 | 
            +
            ### Issues and Feedback
         | 
| 382 | 
            +
             | 
| 383 | 
            +
            - Report issues on the project repository
         | 
| 384 | 
            +
            - Provide feedback on the Trackio interface
         | 
| 385 | 
            +
            - Suggest improvements for the monitoring system
         | 
| 386 | 
            +
             | 
| 387 | 
            +
            ## Conclusion
         | 
| 388 | 
            +
             | 
| 389 | 
            +
            You now have a complete Trackio monitoring system deployed on Hugging Face Spaces! This setup provides:
         | 
| 390 | 
            +
             | 
| 391 | 
            +
            - ✅ Easy experiment tracking and monitoring
         | 
| 392 | 
            +
            - ✅ Real-time metric logging
         | 
| 393 | 
            +
            - ✅ Web-based interface for experiment management
         | 
| 394 | 
            +
            - ✅ Integration with your SmolLM3 fine-tuning pipeline
         | 
| 395 | 
            +
            - ✅ Scalable and accessible monitoring solution
         | 
| 396 | 
            +
             | 
| 397 | 
            +
            Start tracking your experiments and gain insights into your model training process! 
         | 
    	
        PUSH_GUIDE.md
    ADDED
    
    | @@ -0,0 +1,406 @@ | |
# Push to Hugging Face Hub Guide

This guide explains how to use the `push_to_huggingface.py` script to upload your trained SmolLM3 models and results to the Hugging Face Hub.

## Features

- ✅ **Automatic Repository Creation** - Creates HF repositories automatically
- ✅ **Model Validation** - Validates required model files before upload
- ✅ **Comprehensive Model Cards** - Generates detailed model documentation
- ✅ **Training Results Upload** - Uploads logs, configs, and results
- ✅ **Trackio Integration** - Logs push actions to your monitoring system
- ✅ **Private/Public Repositories** - Support for both private and public models

## Prerequisites

### 1. Install Dependencies

```bash
pip install huggingface_hub
```

### 2. Set Up Hugging Face Token

```bash
# Option 1: Environment variable
export HF_TOKEN="your_huggingface_token_here"

# Option 2: Use --token argument
python push_to_huggingface.py model_path repo_name --token "your_token"
```

### 3. Get Your Hugging Face Token

1. Go to https://huggingface.co/settings/tokens
2. Click "New token"
3. Give it a name (e.g., "model-upload")
4. Select "Write" permissions
5. Copy the token (the snippet below verifies that it works)
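
Before the first push, it is worth confirming that the token actually authenticates. A minimal check with `huggingface_hub`, assuming `HF_TOKEN` is exported:

```python
# Verify the token before uploading anything (assumes HF_TOKEN is set).
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
info = api.whoami()  # raises an error if the token is invalid
print(f"Authenticated as: {info['name']}")
```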

## Basic Usage

### Simple Model Push

```bash
python push_to_huggingface.py /path/to/model username/model-name
```

### Push with Custom Token

```bash
python push_to_huggingface.py /path/to/model username/model-name \
    --token "hf_your_token_here"
```

### Push Private Model

```bash
python push_to_huggingface.py /path/to/model username/model-name \
    --private
```

### Push with Trackio Integration

```bash
python push_to_huggingface.py /path/to/model username/model-name \
    --trackio-url "https://your-space.hf.space" \
    --experiment-name "my_experiment"
```

## Complete Workflow Example

### 1. Train Your Model

```bash
python train.py config/train_smollm3.py \
    --dataset_dir my_dataset \
    --enable_tracking \
    --trackio_url "https://your-space.hf.space" \
    --experiment_name "smollm3_finetune_v1"
```

### 2. Push to Hugging Face Hub

```bash
python push_to_huggingface.py /output-checkpoint username/smollm3-finetuned \
    --trackio-url "https://your-space.hf.space" \
    --experiment-name "smollm3_finetune_v1"
```

### 3. Use Your Model

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load your uploaded model
model = AutoModelForCausalLM.from_pretrained("username/smollm3-finetuned")
tokenizer = AutoTokenizer.from_pretrained("username/smollm3-finetuned")

# Generate text
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Repository Structure

After pushing, your repository will contain:

```
username/model-name/
├── README.md                    # Auto-generated model card
├── config.json                  # Model configuration
├── pytorch_model.bin            # Model weights
├── tokenizer.json               # Tokenizer configuration
├── tokenizer_config.json        # Tokenizer settings
├── special_tokens_map.json      # Special tokens
├── training_results/            # Training artifacts
│   ├── train_results.json
│   ├── eval_results.json
│   ├── training_config.json
│   └── training.log
└── .gitattributes               # Git attributes
```

## Model Card Features

The script automatically generates comprehensive model cards including the following (a simplified sketch of how such a card is assembled follows this list):

- **Model Details**: Base model, fine-tuning method, size
- **Training Configuration**: All training parameters
- **Training Results**: Loss, accuracy, steps, time
- **Usage Examples**: Code snippets for loading and using
- **Performance Metrics**: Training and validation metrics
- **Hardware Information**: GPU/CPU used for training
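
The exact card layout is produced inside `push_to_huggingface.py`; the sketch below only illustrates the general idea of assembling such a card from training results, and the function and field names are hypothetical:

```python
# Hypothetical sketch of assembling a model card from training results;
# the real layout is defined inside push_to_huggingface.py.
def build_model_card(repo_name: str, base_model: str, results: dict) -> str:
    lines = [
        f"# {repo_name}",
        "",
        f"Fine-tuned from `{base_model}`.",
        "",
        "## Training Results",
        f"- Final train loss: {results.get('train_loss', 'n/a')}",
        f"- Final eval loss: {results.get('eval_loss', 'n/a')}",
        f"- Training steps: {results.get('global_step', 'n/a')}",
    ]
    return "\n".join(lines)
```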

## Advanced Usage

### Custom Repository Names

```bash
# Public repository
python push_to_huggingface.py /model myusername/smollm3-chatbot

# Private repository
python push_to_huggingface.py /model myusername/smollm3-private --private
```

### Integration with Training Pipeline

```bash
#!/bin/bash
# Complete training and push workflow

# 1. Train the model
python train.py config/train_smollm3.py \
    --dataset_dir my_dataset \
    --enable_tracking \
    --trackio_url "https://your-space.hf.space" \
    --experiment_name "smollm3_v1"

# 2. Push to Hugging Face Hub
python push_to_huggingface.py /output-checkpoint myusername/smollm3-v1 \
    --trackio-url "https://your-space.hf.space" \
    --experiment-name "smollm3_v1"

# 3. Test the model
python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained('myusername/smollm3-v1')
tokenizer = AutoTokenizer.from_pretrained('myusername/smollm3-v1')
print('Model loaded successfully!')
"
```

### Batch Processing Multiple Models

```bash
#!/bin/bash
# Push multiple models

models=(
    "smollm3-baseline"
    "smollm3-high-lr"
    "smollm3-dpo"
)

for model in "${models[@]}"; do
    echo "Pushing $model..."
    python push_to_huggingface.py "/models/$model" "username/$model"
done
```

## Error Handling

### Common Issues and Solutions

#### 1. Missing Model Files

**Error**: `❌ Missing required files: ['config.json', 'pytorch_model.bin']`

**Solution**: Ensure your model directory contains all required files (the snippet after this list checks for them):
- `config.json`
- `pytorch_model.bin`
- `tokenizer.json`
- `tokenizer_config.json`
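
A quick way to verify this before running the script; the file list mirrors the bullets above, and the path is an example to adjust to your checkpoint:

```python
# Check that the checkpoint directory has the files the push script expects.
from pathlib import Path

required = ["config.json", "pytorch_model.bin", "tokenizer.json", "tokenizer_config.json"]
model_dir = Path("/path/to/model")
missing = [f for f in required if not (model_dir / f).exists()]
print("Missing files:", missing or "none")
```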
            +
             | 
| 207 | 
            +
            #### 2. Authentication Issues
         | 
| 208 | 
            +
             | 
| 209 | 
            +
            **Error**: `❌ Failed to create repository: 401 Client Error`
         | 
| 210 | 
            +
             | 
| 211 | 
            +
            **Solution**: 
         | 
| 212 | 
            +
            - Check your HF token is valid
         | 
| 213 | 
            +
            - Ensure token has write permissions
         | 
| 214 | 
            +
            - Verify username in repository name matches your account
         | 
| 215 | 
            +
             | 
| 216 | 
            +
            #### 3. Repository Already Exists
         | 
| 217 | 
            +
             | 
| 218 | 
            +
            **Error**: `Repository already exists`
         | 
| 219 | 
            +
             | 
| 220 | 
            +
            **Solution**: The script handles this automatically with `exist_ok=True`, but you can:
         | 
| 221 | 
            +
            - Use a different repository name
         | 
| 222 | 
            +
            - Delete the existing repository first
         | 
| 223 | 
            +
            - Use version numbers: `username/model-v2`
         | 
| 224 | 
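
If you prefer to pre-create the repository yourself, the same idempotent behaviour is available directly from `huggingface_hub`; a minimal sketch, with the repository name as an example:

```python
# Pre-create (or reuse) the repository; exist_ok=True makes the call idempotent.
import os
from huggingface_hub import create_repo

url = create_repo(
    "username/model-v2",
    token=os.environ["HF_TOKEN"],
    private=True,
    exist_ok=True,
)
print(url)
```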
            +
             | 
| 225 | 
            +
            #### 4. Large File Upload Issues
         | 
| 226 | 
            +
             | 
| 227 | 
            +
            **Error**: `Upload failed for large files`
         | 
| 228 | 
            +
             | 
| 229 | 
            +
            **Solution**:
         | 
| 230 | 
            +
            - Check your internet connection
         | 
| 231 | 
            +
            - Use Git LFS for large files
         | 
| 232 | 
            +
            - Consider splitting large models
         | 
| 233 | 
            +
             | 
| 234 | 
            +
            ## Trackio Integration
         | 
| 235 | 
            +
             | 
| 236 | 
            +
            ### Logging Push Actions
         | 
| 237 | 
            +
             | 
| 238 | 
            +
            When using Trackio integration, the script logs:
         | 
| 239 | 
            +
             | 
| 240 | 
            +
            - **Push Action**: Repository creation and file uploads
         | 
| 241 | 
            +
            - **Model Metadata**: Size, configuration, results
         | 
| 242 | 
            +
            - **Repository Info**: Name, privacy settings, URL
         | 
| 243 | 
            +
            - **Training Results**: Loss, accuracy, steps
         | 
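
What such an entry looks like depends on your setup; a minimal sketch using the `SmolLM3Monitor` instance from the training pipeline, where the field names and values are illustrative rather than a fixed schema:

```python
# Sketch: record a push event alongside the training summary
# (field names and values are illustrative, not a fixed schema).
monitor.log_training_summary({
    "pushed_repo": "username/smollm3-finetuned",
    "pushed_private": False,
    "model_size_gb": 6.2,
    "final_loss": 1.21,
})
```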

### Viewing Push Logs

1. Go to your Trackio Space
2. Navigate to the "View Experiments" tab
3. Find your experiment
4. Check the metrics for push-related actions

## Security Best Practices

### Token Management

```bash
# Use environment variables (recommended)
export HF_TOKEN="your_token_here"
python push_to_huggingface.py model repo

# Don't hardcode tokens in scripts
# ❌ Bad: python push_to_huggingface.py model repo --token "hf_xxx"
```

### Private Models

```bash
# For sensitive models, use private repositories
python push_to_huggingface.py model username/private-model --private
```

### Repository Naming

```bash
# Use descriptive names
python push_to_huggingface.py model username/smollm3-chatbot-v1

# Include version numbers
python push_to_huggingface.py model username/smollm3-v2.0
```

## Performance Optimization

### Large Models

For models > 5GB:

```bash
# Use Git LFS for large files
git lfs install
git lfs track "*.bin"

# Consider splitting models
python push_to_huggingface.py model username/model-large --private
```

### Upload Speed

```bash
# Use a stable internet connection
# Consider uploading during off-peak hours
# Repository visibility (public or private) does not change upload speed
```

## Troubleshooting

### Debug Mode

```bash
# Enable debug logging
export LOG_LEVEL=DEBUG
python push_to_huggingface.py model repo
```

### Validate Model Files

```bash
# Check model structure before pushing
ls -la /path/to/model/
# Should contain: config.json, pytorch_model.bin, tokenizer.json, etc.
```

### Test Repository Access

```bash
# Test your HF token
python -c "
from huggingface_hub import HfApi
api = HfApi(token='your_token')
print(api.whoami()['name'], '- token is valid!')
"
```

## Integration Examples

### With CI/CD Pipeline

```yaml
# .github/workflows/train-and-push.yml
name: Train and Push Model

on:
  push:
    branches: [main]

jobs:
  train-and-push:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Train Model
        run: |
          python train.py config/train_smollm3.py

      - name: Push to HF Hub
        run: |
          python push_to_huggingface.py /output username/model-${{ github.run_number }}
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
```

### With Docker

```dockerfile
# Dockerfile
FROM python:3.9

WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .

CMD ["python", "push_to_huggingface.py", "/model", "username/model"]
```

## Support and Resources

### Documentation

- [Hugging Face Hub Documentation](https://huggingface.co/docs/hub/index)
- [Transformers Documentation](https://huggingface.co/docs/transformers/index)
- [Model Cards Guide](https://huggingface.co/docs/hub/model-cards)

### Community

- [Hugging Face Forums](https://discuss.huggingface.co/)
- [GitHub Issues](https://github.com/huggingface/huggingface_hub/issues)

### Examples

- [Model Repository Examples](https://huggingface.co/models?search=smollm3)
- [Fine-tuned Models](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads)

## Conclusion

The `push_to_huggingface.py` script provides a complete solution for:

- ✅ **Easy Model Deployment** - One command to push models
- ✅ **Professional Documentation** - Auto-generated model cards
- ✅ **Training Artifacts** - Complete experiment tracking
- ✅ **Integration Ready** - Works with CI/CD and monitoring
- ✅ **Security Focused** - Proper token and privacy management

Start sharing your fine-tuned SmolLM3 models with the community!
    	
        README.md
    CHANGED
    
    | @@ -288,4 +288,17 @@ python -m llama_cpp.convert_model ./output-checkpoint --outfile model.gguf | |
 ## License
 
-This project follows the same license as the SmolLM3 model. Please refer to the Hugging Face model page for licensing information.
+This project follows the same license as the SmolLM3 model. Please refer to the Hugging Face model page for licensing information.
+
+
+{
+  "id": "exp_20250718_195852",
+  "name": "petit-elle-l-aime-3",
+  "description": "SmolLM3 fine-tuning experiment",
+  "created_at": "2025-07-18T19:58:52.689087",
+  "status": "running",
+  "metrics": [],
+  "parameters": {},
+  "artifacts": [],
+  "logs": []
+}
    	
        TRACKIO_INTEGRATION.md
    ADDED
    
    | @@ -0,0 +1,252 @@ | |
# Trackio Integration for SmolLM3 Fine-tuning

This document provides comprehensive information about the Trackio experiment tracking and monitoring integration for your SmolLM3 fine-tuning pipeline.

## Features

- **SmolLM3 Fine-tuning**: Support for supervised fine-tuning and DPO training
- **Trackio Integration**: Complete experiment tracking and monitoring
- **Hugging Face Spaces Deployment**: Easy deployment of the Trackio monitoring interface
- **Comprehensive Logging**: Metrics, parameters, artifacts, and system monitoring
- **Flexible Configuration**: Support for various training configurations

## Quick Start

### 1. Install Dependencies

```bash
pip install -r requirements.txt
```

### 2. Basic Training with Trackio

```bash
python train.py config/train_smollm3.py \
    --dataset_dir my_dataset \
    --enable_tracking \
    --trackio_url "https://your-trackio-instance.com" \
    --experiment_name "smollm3_finetune_v1"
```

### 3. Training with Custom Parameters

```bash
python train.py config/train_smollm3.py \
    --dataset_dir my_dataset \
    --batch_size 8 \
    --learning_rate 1e-5 \
    --max_iters 2000 \
    --enable_tracking \
    --trackio_url "https://your-trackio-instance.com" \
    --experiment_name "smollm3_high_lr_experiment"
```

## Trackio Integration

### Configuration

Add Trackio settings to your configuration:

```python
# In your config file
config = SmolLM3Config(
    # ... other settings ...

    # Trackio monitoring configuration
    enable_tracking=True,
    trackio_url="https://your-trackio-instance.com",
    trackio_token="your_token_here",  # Optional
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
    experiment_name="my_experiment"
)
```

### Environment Variables

You can also set the Trackio configuration via environment variables:

```bash
export TRACKIO_URL="https://your-trackio-instance.com"
export TRACKIO_TOKEN="your_token_here"
```

### What Gets Tracked

- **Configuration**: All training parameters and model settings (see the example after this list)
- **Metrics**: Loss, accuracy, learning rate, and custom metrics
- **System Metrics**: GPU memory, CPU usage, training time
- **Artifacts**: Model checkpoints, evaluation results
- **Training Summary**: Final results and experiment duration
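
For the configuration part, `log_config` simply takes a dictionary, so anything serialisable can be recorded. An illustrative snapshot, with values mirroring the defaults shown later in this document:

```python
# Illustrative configuration snapshot passed to the monitor.
config_dict = {
    "model_name": "HuggingFaceTB/SmolLM3-3B",
    "max_seq_length": 4096,
    "batch_size": 4,
    "learning_rate": 2e-5,
    "max_iters": 1000,
}
monitor.log_config(config_dict)
```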

## Hugging Face Spaces Deployment

### Deploy Trackio Monitoring Interface

1. **Create a new Space** on Hugging Face:
   - Go to https://huggingface.co/spaces
   - Click "Create new Space"
   - Choose "Gradio" as the SDK
   - Set visibility (Public or Private)

2. **Upload the deployment files**:
   - `app.py` - The Gradio interface
   - `requirements_space.txt` - Dependencies
   - `README.md` - Documentation

3. **Configure the Space**:
   - The Space will automatically install dependencies
   - The Gradio interface will be available at your Space URL (a programmatic sketch of these steps follows the list)
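
If you prefer to script these steps instead of using the web UI, they can be done with `huggingface_hub` directly. The sketch below assumes your token is in `HF_TOKEN`, the Space name is an example, and that the Space reads its dependencies from `requirements.txt`, so `requirements_space.txt` is uploaded under that name here:

```python
# Sketch: create the Space and upload the deployment files programmatically.
import os
from huggingface_hub import HfApi, create_repo

token = os.environ["HF_TOKEN"]
repo_id = "username/trackio-monitoring"  # example Space name

# Create (or reuse) a Gradio Space.
create_repo(repo_id, token=token, repo_type="space", space_sdk="gradio", exist_ok=True)

api = HfApi(token=token)
for filename in ["app.py", "requirements_space.txt", "README.md"]:
    api.upload_file(
        path_or_fileobj=filename,
        # Assumption: the Space expects requirements.txt, so rename on upload.
        path_in_repo="requirements.txt" if filename == "requirements_space.txt" else filename,
        repo_id=repo_id,
        repo_type="space",
    )
```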

### Using the Trackio Space

1. **Create Experiments**: Use the "Create Experiment" tab to start new experiments
2. **Log Metrics**: Use the "Log Metrics" tab to track training progress
3. **View Results**: Use the "View Experiments" tab to see experiment details
4. **Update Status**: Use the "Update Status" tab to mark experiments as completed

### Integration with Your Training

To connect your training script to the Trackio Space:

```python
# In your training script
from monitoring import SmolLM3Monitor

# Initialize monitor
monitor = SmolLM3Monitor(
    experiment_name="my_experiment",
    trackio_url="https://your-space.hf.space",  # Your Space URL
    enable_tracking=True
)

# Log configuration
monitor.log_config(config_dict)

# Log metrics during training
monitor.log_metrics({"loss": 0.5, "accuracy": 0.85}, step=100)

# Log final results
monitor.log_training_summary(final_results)
```

## Configuration Files

### Main Configuration (`config/train_smollm3.py`)

```python
@dataclass
class SmolLM3Config:
    # Model configuration
    model_name: str = "HuggingFaceTB/SmolLM3-3B"
    max_seq_length: int = 4096

    # Training configuration
    batch_size: int = 4
    learning_rate: float = 2e-5
    max_iters: int = 1000

    # Trackio monitoring
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    experiment_name: Optional[str] = None
```

### DPO Configuration (`config/train_smollm3_dpo.py`)

```python
@dataclass
class SmolLM3DPOConfig(SmolLM3Config):
    # DPO-specific settings
    beta: float = 0.1
    max_prompt_length: int = 2048

    # Trackio monitoring (inherited)
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
```

## Monitoring Features

### Real-time Metrics

- Training loss and evaluation metrics
- Learning rate scheduling
- GPU memory and utilization
- Training time and progress

### Artifact Tracking

- Model checkpoints at regular intervals
- Evaluation results and plots
- Configuration snapshots
- Training logs and summaries

### Experiment Management

- Experiment naming and organization
- Status tracking (running, completed, failed)
- Parameter comparison across experiments
- Result visualization

## Advanced Usage

### Custom Metrics

```python
# Log custom metrics
monitor.log_metrics({
    "custom_metric": value,
    "perplexity": perplexity_score,
    "bleu_score": bleu_score
}, step=current_step)
```

### System Monitoring

```python
# Log system metrics
monitor.log_system_metrics(step=current_step)
```

### Artifact Logging

```python
# Log model checkpoint
monitor.log_model_checkpoint("checkpoint-1000", step=1000)

# Log evaluation results
monitor.log_evaluation_results(eval_results, step=1000)
```

## Troubleshooting

### Common Issues

1. **Trackio not available**: Install with `pip install trackio`
2. **Connection errors**: Check your Trackio URL and token (a quick check follows this list)
3. **Missing metrics**: Ensure monitoring is enabled in the configuration
4. **Space deployment issues**: Check Gradio version compatibility
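
A quick way to confirm points 1 and 2 from a Python shell; a minimal check, not part of the training pipeline:

```python
# Minimal environment check for the two most common issues.
import os

try:
    import trackio  # noqa: F401
    print("trackio is installed")
except ImportError:
    print("trackio is missing: pip install trackio")

print("TRACKIO_URL:", os.environ.get("TRACKIO_URL", "<not set>"))
print("TRACKIO_TOKEN set:", bool(os.environ.get("TRACKIO_TOKEN")))
```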

### Debug Mode

Enable debug logging:

```python
import logging
logging.basicConfig(level=logging.DEBUG)
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests if applicable
5. Submit a pull request

## License

This project is licensed under the MIT License - see the LICENSE file for details.
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,318 @@ | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            Trackio Deployment on Hugging Face Spaces
         | 
| 3 | 
            +
            A Gradio interface for experiment tracking and monitoring
         | 
| 4 | 
            +
            """
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import gradio as gr
         | 
| 7 | 
            +
            import os
         | 
| 8 | 
            +
            import json
         | 
| 9 | 
            +
            import logging
         | 
| 10 | 
            +
            from datetime import datetime
         | 
| 11 | 
            +
            from typing import Dict, Any, Optional
         | 
| 12 | 
            +
            import requests
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            # Setup logging
         | 
| 15 | 
            +
            logging.basicConfig(level=logging.INFO)
         | 
| 16 | 
            +
            logger = logging.getLogger(__name__)
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            class TrackioSpace:
         | 
| 19 | 
            +
                """Trackio deployment for Hugging Face Spaces"""
         | 
| 20 | 
            +
                
         | 
| 21 | 
            +
                def __init__(self):
         | 
| 22 | 
            +
                    self.experiments = {}
         | 
| 23 | 
            +
                    self.current_experiment = None
         | 
| 24 | 
            +
                    
         | 
| 25 | 
            +
                def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
         | 
| 26 | 
            +
                    """Create a new experiment"""
         | 
| 27 | 
            +
                    experiment_id = f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
         | 
| 28 | 
            +
                    
         | 
| 29 | 
            +
                    experiment = {
         | 
| 30 | 
            +
                        'id': experiment_id,
         | 
| 31 | 
            +
                        'name': name,
         | 
| 32 | 
            +
                        'description': description,
         | 
| 33 | 
            +
                        'created_at': datetime.now().isoformat(),
         | 
| 34 | 
            +
                        'status': 'running',
         | 
| 35 | 
            +
                        'metrics': [],
         | 
| 36 | 
            +
                        'parameters': {},
         | 
| 37 | 
            +
                        'artifacts': [],
         | 
| 38 | 
            +
                        'logs': []
         | 
| 39 | 
            +
                    }
         | 
| 40 | 
            +
                    
         | 
| 41 | 
            +
                    self.experiments[experiment_id] = experiment
         | 
| 42 | 
            +
                    self.current_experiment = experiment_id
         | 
| 43 | 
            +
                    
         | 
| 44 | 
            +
                    logger.info(f"Created experiment: {experiment_id} - {name}")
         | 
| 45 | 
            +
                    return experiment
         | 
| 46 | 
            +
                
         | 
| 47 | 
            +
                def log_metrics(self, experiment_id: str, metrics: Dict[str, Any], step: Optional[int] = None):
         | 
| 48 | 
            +
                    """Log metrics for an experiment"""
         | 
| 49 | 
            +
                    if experiment_id not in self.experiments:
         | 
| 50 | 
            +
                        raise ValueError(f"Experiment {experiment_id} not found")
         | 
| 51 | 
            +
                    
         | 
| 52 | 
            +
                    metric_entry = {
         | 
| 53 | 
            +
                        'timestamp': datetime.now().isoformat(),
         | 
| 54 | 
            +
                        'step': step,
         | 
| 55 | 
            +
                        'metrics': metrics
         | 
| 56 | 
            +
                    }
         | 
| 57 | 
            +
                    
         | 
| 58 | 
            +
                    self.experiments[experiment_id]['metrics'].append(metric_entry)
         | 
| 59 | 
            +
                    logger.info(f"Logged metrics for experiment {experiment_id}: {metrics}")
         | 
| 60 | 
            +
                
         | 
| 61 | 
            +
                def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]):
         | 
| 62 | 
            +
                    """Log parameters for an experiment"""
         | 
| 63 | 
            +
                    if experiment_id not in self.experiments:
         | 
| 64 | 
            +
                        raise ValueError(f"Experiment {experiment_id} not found")
         | 
| 65 | 
            +
                    
         | 
| 66 | 
            +
                    self.experiments[experiment_id]['parameters'].update(parameters)
         | 
| 67 | 
            +
                    logger.info(f"Logged parameters for experiment {experiment_id}: {parameters}")
         | 
| 68 | 
            +
                
         | 
| 69 | 
            +
                def log_artifact(self, experiment_id: str, artifact_name: str, artifact_data: str):
         | 
| 70 | 
            +
                    """Log an artifact for an experiment"""
         | 
| 71 | 
            +
                    if experiment_id not in self.experiments:
         | 
| 72 | 
            +
                        raise ValueError(f"Experiment {experiment_id} not found")
         | 
| 73 | 
            +
                    
         | 
| 74 | 
            +
                    artifact_entry = {
         | 
| 75 | 
            +
                        'name': artifact_name,
         | 
| 76 | 
            +
                        'timestamp': datetime.now().isoformat(),
         | 
| 77 | 
            +
                        'data': artifact_data
         | 
| 78 | 
            +
                    }
         | 
| 79 | 
            +
                    
         | 
| 80 | 
            +
                    self.experiments[experiment_id]['artifacts'].append(artifact_entry)
         | 
| 81 | 
            +
                    logger.info(f"Logged artifact for experiment {experiment_id}: {artifact_name}")
         | 
| 82 | 
            +
                
         | 
| 83 | 
            +
                def get_experiment(self, experiment_id: str) -> Optional[Dict[str, Any]]:
         | 
| 84 | 
            +
                    """Get experiment details"""
         | 
| 85 | 
            +
                    return self.experiments.get(experiment_id)
         | 
| 86 | 
            +
                
         | 
| 87 | 
            +
                def list_experiments(self) -> Dict[str, Any]:
         | 
| 88 | 
            +
                    """List all experiments"""
         | 
| 89 | 
            +
                    return {
         | 
| 90 | 
            +
                        'experiments': list(self.experiments.keys()),
         | 
| 91 | 
            +
                        'current_experiment': self.current_experiment,
         | 
| 92 | 
            +
                        'total_experiments': len(self.experiments)
         | 
| 93 | 
            +
                    }
         | 
| 94 | 
            +
                
         | 
| 95 | 
            +
                def update_experiment_status(self, experiment_id: str, status: str):
         | 
| 96 | 
            +
                    """Update experiment status"""
         | 
| 97 | 
            +
                    if experiment_id in self.experiments:
         | 
| 98 | 
            +
                        self.experiments[experiment_id]['status'] = status
         | 
| 99 | 
            +
                        logger.info(f"Updated experiment {experiment_id} status to {status}")
         | 
| 100 | 
            +
             | 
| 101 | 
            +
            # Initialize Trackio space
         | 
| 102 | 
            +
            trackio_space = TrackioSpace()
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            def create_experiment_interface(name: str, description: str) -> str:
         | 
| 105 | 
            +
                """Create a new experiment"""
         | 
| 106 | 
            +
                try:
         | 
| 107 | 
            +
                    experiment = trackio_space.create_experiment(name, description)
         | 
| 108 | 
            +
                    return f"✅ Experiment created successfully!\nID: {experiment['id']}\nName: {experiment['name']}"
         | 
| 109 | 
            +
                except Exception as e:
         | 
| 110 | 
            +
                    return f"❌ Error creating experiment: {str(e)}"
         | 
| 111 | 
            +
             | 
| 112 | 
            +
            def log_metrics_interface(experiment_id: str, metrics_json: str, step: str) -> str:
         | 
| 113 | 
            +
                """Log metrics for an experiment"""
         | 
| 114 | 
            +
                try:
         | 
| 115 | 
            +
                    metrics = json.loads(metrics_json)
         | 
| 116 | 
            +
                    step_int = int(step) if step else None
         | 
| 117 | 
            +
                    trackio_space.log_metrics(experiment_id, metrics, step_int)
         | 
| 118 | 
            +
                    return f"✅ Metrics logged successfully for experiment {experiment_id}"
         | 
| 119 | 
            +
                except Exception as e:
         | 
| 120 | 
            +
                    return f"❌ Error logging metrics: {str(e)}"
         | 
| 121 | 
            +
             | 
| 122 | 
            +
            def log_parameters_interface(experiment_id: str, parameters_json: str) -> str:
         | 
| 123 | 
            +
                """Log parameters for an experiment"""
         | 
| 124 | 
            +
                try:
         | 
| 125 | 
            +
                    parameters = json.loads(parameters_json)
         | 
| 126 | 
            +
                    trackio_space.log_parameters(experiment_id, parameters)
         | 
| 127 | 
            +
                    return f"✅ Parameters logged successfully for experiment {experiment_id}"
         | 
| 128 | 
            +
                except Exception as e:
         | 
| 129 | 
            +
                    return f"❌ Error logging parameters: {str(e)}"
         | 
| 130 | 
            +
             | 
| 131 | 
            +
            def get_experiment_details(experiment_id: str) -> str:
         | 
| 132 | 
            +
                """Get experiment details"""
         | 
| 133 | 
            +
                try:
         | 
| 134 | 
            +
                    experiment = trackio_space.get_experiment(experiment_id)
         | 
| 135 | 
            +
                    if experiment:
         | 
| 136 | 
            +
                        return json.dumps(experiment, indent=2)
         | 
| 137 | 
            +
                    else:
         | 
| 138 | 
            +
                        return f"❌ Experiment {experiment_id} not found"
         | 
| 139 | 
            +
                except Exception as e:
         | 
| 140 | 
            +
                    return f"❌ Error getting experiment details: {str(e)}"
         | 
| 141 | 
            +
             | 
| 142 | 
            +
            def list_experiments_interface() -> str:
         | 
| 143 | 
            +
                """List all experiments"""
         | 
| 144 | 
            +
                try:
         | 
| 145 | 
            +
                    experiments_info = trackio_space.list_experiments()
         | 
| 146 | 
            +
                    return json.dumps(experiments_info, indent=2)
         | 
| 147 | 
            +
                except Exception as e:
         | 
| 148 | 
            +
                    return f"❌ Error listing experiments: {str(e)}"
         | 
| 149 | 
            +
             | 
| 150 | 
            +
            def update_experiment_status_interface(experiment_id: str, status: str) -> str:
         | 
| 151 | 
            +
                """Update experiment status"""
         | 
| 152 | 
            +
                try:
         | 
| 153 | 
            +
                    trackio_space.update_experiment_status(experiment_id, status)
         | 
| 154 | 
            +
                    return f"✅ Experiment {experiment_id} status updated to {status}"
         | 
| 155 | 
            +
                except Exception as e:
         | 
| 156 | 
            +
                    return f"❌ Error updating experiment status: {str(e)}"
         | 
| 157 | 
            +
             | 
| 158 | 
            +
            # Create Gradio interface
         | 
| 159 | 
            +
            with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as demo:
         | 
| 160 | 
            +
                gr.Markdown("# 🚀 Trackio Experiment Tracking")
         | 
| 161 | 
            +
                gr.Markdown("Monitor and track your ML experiments with ease!")
         | 
| 162 | 
            +
                
         | 
| 163 | 
            +
                with gr.Tabs():
         | 
| 164 | 
            +
                    # Create Experiment Tab
         | 
| 165 | 
            +
                    with gr.Tab("Create Experiment"):
         | 
| 166 | 
            +
                        gr.Markdown("### Create a New Experiment")
         | 
| 167 | 
            +
                        with gr.Row():
         | 
| 168 | 
            +
                            with gr.Column():
         | 
| 169 | 
            +
                                experiment_name = gr.Textbox(
         | 
| 170 | 
            +
                                    label="Experiment Name",
         | 
| 171 | 
            +
                                    placeholder="my_smollm3_finetune",
         | 
| 172 | 
            +
                                    value="smollm3_finetune"
         | 
| 173 | 
            +
                                )
         | 
| 174 | 
            +
                                experiment_description = gr.Textbox(
         | 
| 175 | 
            +
                                    label="Description",
         | 
| 176 | 
            +
                                    placeholder="Fine-tuning SmolLM3 model on custom dataset",
         | 
| 177 | 
            +
                                    value="SmolLM3 fine-tuning experiment"
         | 
| 178 | 
            +
                                )
         | 
| 179 | 
            +
                                create_btn = gr.Button("Create Experiment", variant="primary")
         | 
| 180 | 
            +
                            
         | 
| 181 | 
            +
                            with gr.Column():
         | 
| 182 | 
            +
                                create_output = gr.Textbox(
         | 
| 183 | 
            +
                                    label="Result",
         | 
| 184 | 
            +
                                    lines=5,
         | 
| 185 | 
            +
                                    interactive=False
         | 
| 186 | 
            +
                                )
         | 
| 187 | 
            +
                        
         | 
| 188 | 
            +
                        create_btn.click(
         | 
| 189 | 
            +
                            create_experiment_interface,
         | 
| 190 | 
            +
                            inputs=[experiment_name, experiment_description],
         | 
| 191 | 
            +
                            outputs=create_output
         | 
| 192 | 
            +
                        )
         | 
| 193 | 
            +
                    
         | 
| 194 | 
            +
                    # Log Metrics Tab
         | 
| 195 | 
            +
                    with gr.Tab("Log Metrics"):
         | 
| 196 | 
            +
                        gr.Markdown("### Log Training Metrics")
         | 
| 197 | 
            +
                        with gr.Row():
         | 
| 198 | 
            +
                            with gr.Column():
         | 
| 199 | 
            +
                                metrics_exp_id = gr.Textbox(
         | 
| 200 | 
            +
                                    label="Experiment ID",
         | 
| 201 | 
            +
                                    placeholder="exp_20231201_143022"
         | 
| 202 | 
            +
                                )
         | 
| 203 | 
            +
                                metrics_json = gr.Textbox(
         | 
| 204 | 
            +
                                    label="Metrics (JSON)",
         | 
| 205 | 
            +
                                    placeholder='{"loss": 0.5, "accuracy": 0.85}',
         | 
| 206 | 
            +
                                    value='{"loss": 0.5, "accuracy": 0.85}'
         | 
| 207 | 
            +
                                )
         | 
| 208 | 
            +
                                metrics_step = gr.Textbox(
         | 
| 209 | 
            +
                                    label="Step (optional)",
         | 
| 210 | 
            +
                                    placeholder="100"
         | 
| 211 | 
            +
                                )
         | 
| 212 | 
            +
                                log_metrics_btn = gr.Button("Log Metrics", variant="primary")
         | 
| 213 | 
            +
                            
         | 
| 214 | 
            +
                            with gr.Column():
         | 
| 215 | 
            +
                                metrics_output = gr.Textbox(
         | 
| 216 | 
            +
                                    label="Result",
         | 
| 217 | 
            +
                                    lines=3,
         | 
| 218 | 
            +
                                    interactive=False
         | 
| 219 | 
            +
                                )
         | 
| 220 | 
            +
                        
         | 
| 221 | 
            +
                        log_metrics_btn.click(
         | 
| 222 | 
            +
                            log_metrics_interface,
         | 
| 223 | 
            +
                            inputs=[metrics_exp_id, metrics_json, metrics_step],
         | 
| 224 | 
            +
                            outputs=metrics_output
         | 
| 225 | 
            +
                        )
         | 
| 226 | 
            +
                    
         | 
| 227 | 
            +
                    # Log Parameters Tab
         | 
| 228 | 
            +
                    with gr.Tab("Log Parameters"):
         | 
| 229 | 
            +
                        gr.Markdown("### Log Experiment Parameters")
         | 
| 230 | 
            +
                        with gr.Row():
         | 
| 231 | 
            +
                            with gr.Column():
         | 
| 232 | 
            +
                                params_exp_id = gr.Textbox(
         | 
| 233 | 
            +
                                    label="Experiment ID",
         | 
| 234 | 
            +
                                    placeholder="exp_20231201_143022"
         | 
| 235 | 
            +
                                )
         | 
| 236 | 
            +
                                parameters_json = gr.Textbox(
         | 
| 237 | 
            +
                                    label="Parameters (JSON)",
         | 
| 238 | 
            +
                                    placeholder='{"learning_rate": 2e-5, "batch_size": 4}',
         | 
| 239 | 
            +
                                    value='{"learning_rate": 2e-5, "batch_size": 4, "model_name": "HuggingFaceTB/SmolLM3-3B"}'
         | 
| 240 | 
            +
                                )
         | 
| 241 | 
            +
                                log_params_btn = gr.Button("Log Parameters", variant="primary")
         | 
| 242 | 
            +
                            
         | 
| 243 | 
            +
                            with gr.Column():
         | 
| 244 | 
            +
                                params_output = gr.Textbox(
         | 
| 245 | 
            +
                                    label="Result",
         | 
| 246 | 
            +
                                    lines=3,
         | 
| 247 | 
            +
                                    interactive=False
         | 
| 248 | 
            +
                                )
         | 
| 249 | 
            +
                        
         | 
| 250 | 
            +
                        log_params_btn.click(
         | 
| 251 | 
            +
                            log_parameters_interface,
         | 
| 252 | 
            +
                            inputs=[params_exp_id, parameters_json],
         | 
| 253 | 
            +
                            outputs=params_output
         | 
| 254 | 
            +
                        )
         | 
| 255 | 
            +
                    
         | 
| 256 | 
            +
                    # View Experiments Tab
         | 
| 257 | 
            +
                    with gr.Tab("View Experiments"):
         | 
| 258 | 
            +
                        gr.Markdown("### View Experiment Details")
         | 
| 259 | 
            +
                        with gr.Row():
         | 
| 260 | 
            +
                            with gr.Column():
         | 
| 261 | 
            +
                                view_exp_id = gr.Textbox(
         | 
| 262 | 
            +
                                    label="Experiment ID",
         | 
| 263 | 
            +
                                    placeholder="exp_20231201_143022"
         | 
| 264 | 
            +
                                )
         | 
| 265 | 
            +
                                view_btn = gr.Button("View Experiment", variant="primary")
         | 
| 266 | 
            +
                                list_btn = gr.Button("List All Experiments", variant="secondary")
         | 
| 267 | 
            +
                            
         | 
| 268 | 
            +
                            with gr.Column():
         | 
| 269 | 
            +
                                view_output = gr.Textbox(
         | 
| 270 | 
            +
                                    label="Experiment Details",
         | 
| 271 | 
            +
                                    lines=15,
         | 
| 272 | 
            +
                                    interactive=False
         | 
| 273 | 
            +
                                )
         | 
| 274 | 
            +
                        
         | 
| 275 | 
            +
                        view_btn.click(
         | 
| 276 | 
            +
                            get_experiment_details,
         | 
| 277 | 
            +
                            inputs=[view_exp_id],
         | 
| 278 | 
            +
                            outputs=view_output
         | 
| 279 | 
            +
                        )
         | 
| 280 | 
            +
                        
         | 
| 281 | 
            +
                        list_btn.click(
         | 
| 282 | 
            +
                            list_experiments_interface,
         | 
| 283 | 
            +
                            inputs=[],
         | 
| 284 | 
            +
                            outputs=view_output
         | 
| 285 | 
            +
                        )
         | 
| 286 | 
            +
                    
         | 
| 287 | 
            +
                    # Update Status Tab
         | 
| 288 | 
            +
                    with gr.Tab("Update Status"):
         | 
| 289 | 
            +
                        gr.Markdown("### Update Experiment Status")
         | 
| 290 | 
            +
                        with gr.Row():
         | 
| 291 | 
            +
                            with gr.Column():
         | 
| 292 | 
            +
                                status_exp_id = gr.Textbox(
         | 
| 293 | 
            +
                                    label="Experiment ID",
         | 
| 294 | 
            +
                                    placeholder="exp_20231201_143022"
         | 
| 295 | 
            +
                                )
         | 
| 296 | 
            +
                                status_dropdown = gr.Dropdown(
         | 
| 297 | 
            +
                                    label="Status",
         | 
| 298 | 
            +
                                    choices=["running", "completed", "failed", "paused"],
         | 
| 299 | 
            +
                                    value="running"
         | 
| 300 | 
            +
                                )
         | 
| 301 | 
            +
                                update_status_btn = gr.Button("Update Status", variant="primary")
         | 
| 302 | 
            +
                            
         | 
| 303 | 
            +
                            with gr.Column():
         | 
| 304 | 
            +
                                status_output = gr.Textbox(
         | 
| 305 | 
            +
                                    label="Result",
         | 
| 306 | 
            +
                                    lines=3,
         | 
| 307 | 
            +
                                    interactive=False
         | 
| 308 | 
            +
                                )
         | 
| 309 | 
            +
                        
         | 
| 310 | 
            +
                        update_status_btn.click(
         | 
| 311 | 
            +
                            update_experiment_status_interface,
         | 
| 312 | 
            +
                            inputs=[status_exp_id, status_dropdown],
         | 
| 313 | 
            +
                            outputs=status_output
         | 
| 314 | 
            +
                        )
         | 
| 315 | 
            +
             | 
| 316 | 
            +
            # Launch the app
         | 
| 317 | 
            +
            if __name__ == "__main__":
         | 
| 318 | 
            +
                demo.launch() 
         | 
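
For reference, the `TrackioSpace` class defined above can also be driven outside the Gradio UI, for example from a notebook or a training callback. A minimal sketch, assuming the file is saved as `app.py` and importable:

```python
# Minimal sketch: use TrackioSpace directly, without the Gradio interface.
# Assumes this file is importable as `app` (i.e. saved as app.py on the Python path).
from app import TrackioSpace

space = TrackioSpace()
exp = space.create_experiment("smollm3_finetune", "SmolLM3 fine-tuning experiment")

# Log hyperparameters once, then metrics at each logging step.
space.log_parameters(exp["id"], {"learning_rate": 2e-5, "batch_size": 4})
space.log_metrics(exp["id"], {"loss": 0.5, "accuracy": 0.85}, step=100)

space.update_experiment_status(exp["id"], "completed")
print(space.list_experiments())
```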
    	
        cloud_deployment.sh
    ADDED
    
    | @@ -0,0 +1,279 @@ | |
| 1 | 
            +
            #!/bin/bash
         | 
| 2 | 
            +
            # Cloud Deployment Script for SmolLM3 DPO Training
         | 
| 3 | 
            +
            # This script sets up a cloud instance for training and uploading to Hugging Face
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            set -e  # Exit on any error
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            echo "🚀 Starting SmolLM3 DPO Cloud Deployment"
         | 
| 8 | 
            +
            echo "=========================================="
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # Configuration
         | 
| 11 | 
            +
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
         | 
| 12 | 
            +
            DATASET_NAME="HuggingFaceTB/smoltalk"
         | 
| 13 | 
            +
            EXPERIMENT_NAME="smollm3_dpo_6epochs"
         | 
| 14 | 
            +
            REPO_NAME="your-username/smollm3-dpo-6epochs"  # Change this to your username
         | 
| 15 | 
            +
            TRACKIO_URL="https://your-trackio-space.hf.space"  # Change this to your Trackio Space URL
         | 
| 16 | 
            +
            HF_TOKEN="your_hf_token_here"  # Change this to your HF token
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            # Training Configuration
         | 
| 19 | 
            +
            BATCH_SIZE=2
         | 
| 20 | 
            +
            GRADIENT_ACCUMULATION_STEPS=8
         | 
| 21 | 
            +
            LEARNING_RATE=5e-6
         | 
| 22 | 
            +
            MAX_EPOCHS=6
         | 
| 23 | 
            +
            MAX_SEQ_LENGTH=4096
         | 
| 24 | 
            +
            SAVE_STEPS=500
         | 
| 25 | 
            +
            EVAL_STEPS=100
         | 
| 26 | 
            +
            LOGGING_STEPS=10
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            echo "📋 Configuration:"
         | 
| 29 | 
            +
            echo "  Model: $MODEL_NAME"
         | 
| 30 | 
            +
            echo "  Dataset: $DATASET_NAME"
         | 
| 31 | 
            +
            echo "  Experiment: $EXPERIMENT_NAME"
         | 
| 32 | 
            +
            echo "  Repository: $REPO_NAME"
         | 
| 33 | 
            +
            echo "  Epochs: $MAX_EPOCHS"
         | 
| 34 | 
            +
            echo "  Batch Size: $BATCH_SIZE"
         | 
| 35 | 
            +
            echo "  Learning Rate: $LEARNING_RATE"
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            # Step 1: Update system and install dependencies
         | 
| 38 | 
            +
            echo ""
         | 
| 39 | 
            +
            echo "🔧 Step 1: Installing system dependencies..."
         | 
| 40 | 
            +
            sudo apt-get update
         | 
| 41 | 
            +
            sudo apt-get install -y git curl wget unzip
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            # Step 2: Install Python and pip
         | 
| 44 | 
            +
            echo ""
         | 
| 45 | 
            +
            echo "🐍 Step 2: Installing Python dependencies..."
         | 
| 46 | 
            +
            sudo apt-get install -y python3 python3-pip python3-venv
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            # Step 3: Create virtual environment
         | 
| 49 | 
            +
            echo ""
         | 
| 50 | 
            +
            echo "📦 Step 3: Setting up Python virtual environment..."
         | 
| 51 | 
            +
            python3 -m venv smollm3_env
         | 
| 52 | 
            +
            source smollm3_env/bin/activate
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            # Step 4: Install PyTorch and CUDA
         | 
| 55 | 
            +
            echo ""
         | 
| 56 | 
            +
            echo "🔥 Step 4: Installing PyTorch with CUDA support..."
         | 
| 57 | 
            +
            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            # Step 5: Install project dependencies
         | 
| 60 | 
            +
            echo ""
         | 
| 61 | 
            +
            echo "📚 Step 5: Installing project dependencies..."
         | 
| 62 | 
            +
            pip install -r requirements.txt
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            # Step 6: Install additional dependencies for DPO
         | 
| 65 | 
            +
            echo ""
         | 
| 66 | 
            +
            echo "🎯 Step 6: Installing DPO-specific dependencies..."
         | 
| 67 | 
            +
             pip install "trl>=0.7.0"
         | 
| 68 | 
            +
             pip install "peft>=0.4.0"
         | 
| 69 | 
            +
             pip install "accelerate>=0.20.0"
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            # Step 7: Set up Hugging Face token
         | 
| 72 | 
            +
            echo ""
         | 
| 73 | 
            +
            echo "🔑 Step 7: Setting up Hugging Face authentication..."
         | 
| 74 | 
            +
            export HF_TOKEN="$HF_TOKEN"
         | 
| 75 | 
            +
            huggingface-cli login --token $HF_TOKEN
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            # Step 8: Create DPO configuration
         | 
| 78 | 
            +
            echo ""
         | 
| 79 | 
            +
            echo "⚙️ Step 8: Creating DPO configuration..."
         | 
| 80 | 
            +
            cat > config/train_smollm3_dpo_6epochs.py << EOF
         | 
| 81 | 
            +
            """
         | 
| 82 | 
            +
            SmolLM3 DPO Training Configuration - 6 Epochs
         | 
| 83 | 
            +
            Optimized for cloud deployment
         | 
| 84 | 
            +
            """
         | 
| 85 | 
            +
             | 
| 86 | 
            +
            from config.train_smollm3_dpo import SmolLM3DPOConfig
         | 
| 87 | 
            +
             | 
| 88 | 
            +
            config = SmolLM3DPOConfig(
         | 
| 89 | 
            +
                # Model configuration
         | 
| 90 | 
            +
                model_name="$MODEL_NAME",
         | 
| 91 | 
            +
                max_seq_length=$MAX_SEQ_LENGTH,
         | 
| 92 | 
            +
                use_flash_attention=True,
         | 
| 93 | 
            +
                use_gradient_checkpointing=True,
         | 
| 94 | 
            +
                
         | 
| 95 | 
            +
                # Training configuration
         | 
| 96 | 
            +
                batch_size=$BATCH_SIZE,
         | 
| 97 | 
            +
                gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS,
         | 
| 98 | 
            +
                learning_rate=$LEARNING_RATE,
         | 
| 99 | 
            +
                weight_decay=0.01,
         | 
| 100 | 
            +
                warmup_steps=100,
         | 
| 101 | 
            +
                max_iters=None,  # Will be calculated based on epochs
         | 
| 102 | 
            +
                eval_interval=100,
         | 
| 103 | 
            +
                log_interval=10,
         | 
| 104 | 
            +
                save_interval=500,
         | 
| 105 | 
            +
                
         | 
| 106 | 
            +
                # DPO configuration
         | 
| 107 | 
            +
                beta=0.1,
         | 
| 108 | 
            +
                max_prompt_length=$((MAX_SEQ_LENGTH / 2)),
         | 
| 109 | 
            +
                
         | 
| 110 | 
            +
                # Optimizer configuration
         | 
| 111 | 
            +
                optimizer="adamw",
         | 
| 112 | 
            +
                beta1=0.9,
         | 
| 113 | 
            +
                beta2=0.95,
         | 
| 114 | 
            +
                eps=1e-8,
         | 
| 115 | 
            +
                
         | 
| 116 | 
            +
                # Scheduler configuration
         | 
| 117 | 
            +
                scheduler="cosine",
         | 
| 118 | 
            +
                min_lr=1e-6,
         | 
| 119 | 
            +
                
         | 
| 120 | 
            +
                # Mixed precision
         | 
| 121 | 
            +
                fp16=True,
         | 
| 122 | 
            +
                bf16=False,
         | 
| 123 | 
            +
                
         | 
| 124 | 
            +
                # Logging and saving
         | 
| 125 | 
            +
                save_steps=$SAVE_STEPS,
         | 
| 126 | 
            +
                eval_steps=$EVAL_STEPS,
         | 
| 127 | 
            +
                logging_steps=$LOGGING_STEPS,
         | 
| 128 | 
            +
                save_total_limit=3,
         | 
| 129 | 
            +
                
         | 
| 130 | 
            +
                # Evaluation
         | 
| 131 | 
            +
                eval_strategy="steps",
         | 
| 132 | 
            +
                metric_for_best_model="eval_loss",
         | 
| 133 | 
            +
                greater_is_better=False,
         | 
| 134 | 
            +
                load_best_model_at_end=True,
         | 
| 135 | 
            +
                
         | 
| 136 | 
            +
                # Data configuration
         | 
| 137 | 
            +
                data_dir="smoltalk_dataset",
         | 
| 138 | 
            +
                train_file="train.json",
         | 
| 139 | 
            +
                validation_file="validation.json",
         | 
| 140 | 
            +
                
         | 
| 141 | 
            +
                # Chat template configuration
         | 
| 142 | 
            +
                use_chat_template=True,
         | 
| 143 | 
            +
                chat_template_kwargs={
         | 
| 144 | 
            +
                    "enable_thinking": False,
         | 
| 145 | 
            +
                    "add_generation_prompt": True
         | 
| 146 | 
            +
                },
         | 
| 147 | 
            +
                
         | 
| 148 | 
            +
                # Trackio monitoring configuration
         | 
| 149 | 
            +
                enable_tracking=True,
         | 
| 150 | 
            +
                trackio_url="$TRACKIO_URL",
         | 
| 151 | 
            +
                trackio_token=None,
         | 
| 152 | 
            +
                log_artifacts=True,
         | 
| 153 | 
            +
                log_metrics=True,
         | 
| 154 | 
            +
                log_config=True,
         | 
| 155 | 
            +
                experiment_name="$EXPERIMENT_NAME"
         | 
| 156 | 
            +
            )
         | 
| 157 | 
            +
            EOF
         | 
| 158 | 
            +
             | 
| 159 | 
            +
            # Step 9: Download and prepare dataset
         | 
| 160 | 
            +
            echo ""
         | 
| 161 | 
            +
            echo "📊 Step 9: Downloading and preparing dataset..."
         | 
| 162 | 
            +
            python -c "
         | 
| 163 | 
            +
            from datasets import load_dataset
         | 
| 164 | 
            +
            import json
         | 
| 165 | 
            +
            import os
         | 
| 166 | 
            +
             | 
| 167 | 
            +
            # Load SmolTalk dataset
         | 
| 168 | 
            +
            print('Loading SmolTalk dataset...')
         | 
| 169 | 
            +
            dataset = load_dataset('$DATASET_NAME')
         | 
| 170 | 
            +
             | 
| 171 | 
            +
            # Create dataset directory
         | 
| 172 | 
            +
            os.makedirs('smoltalk_dataset', exist_ok=True)
         | 
| 173 | 
            +
             | 
| 174 | 
            +
            # Convert to DPO format (preference pairs)
         | 
| 175 | 
            +
            def convert_to_dpo_format(example):
         | 
| 176 | 
            +
                # For SmolTalk, we'll create preference pairs based on response quality
         | 
| 177 | 
            +
                # This is a simplified example - you may need to adjust based on your needs
         | 
| 178 | 
            +
                return {
         | 
| 179 | 
            +
                    'prompt': example.get('prompt', ''),
         | 
| 180 | 
            +
                    'chosen': example.get('chosen', ''),
         | 
| 181 | 
            +
                    'rejected': example.get('rejected', '')
         | 
| 182 | 
            +
                }
         | 
| 183 | 
            +
             | 
| 184 | 
            +
            # Process train split
         | 
| 185 | 
            +
            train_data = []
         | 
| 186 | 
            +
            for example in dataset['train']:
         | 
| 187 | 
            +
                dpo_example = convert_to_dpo_format(example)
         | 
| 188 | 
            +
                if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
         | 
| 189 | 
            +
                    train_data.append(dpo_example)
         | 
| 190 | 
            +
             | 
| 191 | 
            +
            # Process validation split
         | 
| 192 | 
            +
            val_data = []
         | 
| 193 | 
            +
            for example in dataset['validation']:
         | 
| 194 | 
            +
                dpo_example = convert_to_dpo_format(example)
         | 
| 195 | 
            +
                if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
         | 
| 196 | 
            +
                    val_data.append(dpo_example)
         | 
| 197 | 
            +
             | 
| 198 | 
            +
            # Save to files
         | 
| 199 | 
            +
            with open('smoltalk_dataset/train.json', 'w') as f:
         | 
| 200 | 
            +
                json.dump(train_data, f, indent=2)
         | 
| 201 | 
            +
             | 
| 202 | 
            +
            with open('smoltalk_dataset/validation.json', 'w') as f:
         | 
| 203 | 
            +
                json.dump(val_data, f, indent=2)
         | 
| 204 | 
            +
             | 
| 205 | 
            +
            print(f'Dataset prepared: {len(train_data)} train samples, {len(val_data)} validation samples')
         | 
| 206 | 
            +
            "
         | 
| 207 | 
            +
             | 
| 208 | 
            +
            # Step 10: Calculate training steps based on epochs
         | 
| 209 | 
            +
            echo ""
         | 
| 210 | 
            +
            echo "📈 Step 10: Calculating training parameters..."
         | 
| 211 | 
            +
            TOTAL_SAMPLES=$(python -c "import json; data=json.load(open('smoltalk_dataset/train.json')); print(len(data))")
         | 
| 212 | 
            +
            EFFECTIVE_BATCH_SIZE=$((BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
         | 
| 213 | 
            +
            STEPS_PER_EPOCH=$((TOTAL_SAMPLES / EFFECTIVE_BATCH_SIZE))
         | 
| 214 | 
            +
            MAX_STEPS=$((STEPS_PER_EPOCH * MAX_EPOCHS))
         | 
| 215 | 
            +
             | 
| 216 | 
            +
            echo "  Total samples: $TOTAL_SAMPLES"
         | 
| 217 | 
            +
            echo "  Effective batch size: $EFFECTIVE_BATCH_SIZE"
         | 
| 218 | 
            +
            echo "  Steps per epoch: $STEPS_PER_EPOCH"
         | 
| 219 | 
            +
            echo "  Total training steps: $MAX_STEPS"
         | 
| 220 | 
            +
             | 
| 221 | 
            +
            # Step 11: Start DPO training
         | 
| 222 | 
            +
            echo ""
         | 
| 223 | 
            +
            echo "🎯 Step 11: Starting DPO training..."
         | 
| 224 | 
            +
            python train.py config/train_smollm3_dpo_6epochs.py \
         | 
| 225 | 
            +
                --dataset_dir smoltalk_dataset \
         | 
| 226 | 
            +
                --out_dir /output-checkpoint \
         | 
| 227 | 
            +
                --init_from scratch \
         | 
| 228 | 
            +
                --max_iters $MAX_STEPS \
         | 
| 229 | 
            +
                --batch_size $BATCH_SIZE \
         | 
| 230 | 
            +
                --learning_rate $LEARNING_RATE \
         | 
| 231 | 
            +
                --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
         | 
| 232 | 
            +
                --max_seq_length $MAX_SEQ_LENGTH \
         | 
| 233 | 
            +
                --save_steps $SAVE_STEPS \
         | 
| 234 | 
            +
                --eval_steps $EVAL_STEPS \
         | 
| 235 | 
            +
                --logging_steps $LOGGING_STEPS \
         | 
| 236 | 
            +
                --enable_tracking \
         | 
| 237 | 
            +
                --trackio_url "$TRACKIO_URL" \
         | 
| 238 | 
            +
                --experiment_name "$EXPERIMENT_NAME"
         | 
| 239 | 
            +
             | 
| 240 | 
            +
            # Step 12: Push model to Hugging Face Hub
         | 
| 241 | 
            +
            echo ""
         | 
| 242 | 
            +
            echo "📤 Step 12: Pushing model to Hugging Face Hub..."
         | 
| 243 | 
            +
            python push_to_huggingface.py /output-checkpoint "$REPO_NAME" \
         | 
| 244 | 
            +
                --token "$HF_TOKEN" \
         | 
| 245 | 
            +
                --trackio-url "$TRACKIO_URL" \
         | 
| 246 | 
            +
                --experiment-name "$EXPERIMENT_NAME"
         | 
| 247 | 
            +
             | 
| 248 | 
            +
            # Step 13: Test the uploaded model
         | 
| 249 | 
            +
            echo ""
         | 
| 250 | 
            +
            echo "🧪 Step 13: Testing uploaded model..."
         | 
| 251 | 
            +
            python -c "
         | 
| 252 | 
            +
            from transformers import AutoModelForCausalLM, AutoTokenizer
         | 
| 253 | 
            +
            import torch
         | 
| 254 | 
            +
             | 
| 255 | 
            +
            print('Loading uploaded model...')
         | 
| 256 | 
            +
            model = AutoModelForCausalLM.from_pretrained('$REPO_NAME', torch_dtype=torch.float16, device_map='auto')
         | 
| 257 | 
            +
            tokenizer = AutoTokenizer.from_pretrained('$REPO_NAME')
         | 
| 258 | 
            +
             | 
| 259 | 
            +
            print('Testing model generation...')
         | 
| 260 | 
            +
            prompt = 'Hello, how are you?'
         | 
| 261 | 
            +
            inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
         | 
| 262 | 
            +
            outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
         | 
| 263 | 
            +
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         | 
| 264 | 
            +
            print(f'Prompt: {prompt}')
         | 
| 265 | 
            +
            print(f'Response: {response}')
         | 
| 266 | 
            +
            print('✅ Model test completed successfully!')
         | 
| 267 | 
            +
            "
         | 
| 268 | 
            +
             | 
| 269 | 
            +
            echo ""
         | 
| 270 | 
            +
            echo "🎉 Deployment completed successfully!"
         | 
| 271 | 
            +
            echo "====================================="
         | 
| 272 | 
            +
            echo "📊 Model: https://huggingface.co/$REPO_NAME"
         | 
| 273 | 
            +
            echo "📈 Trackio: $TRACKIO_URL"
         | 
| 274 | 
            +
            echo "📋 Experiment: $EXPERIMENT_NAME"
         | 
| 275 | 
            +
            echo ""
         | 
| 276 | 
            +
            echo "Next steps:"
         | 
| 277 | 
            +
            echo "1. Monitor training progress in your Trackio Space"
         | 
| 278 | 
            +
            echo "2. Check the model repository on Hugging Face Hub"
         | 
| 279 | 
            +
            echo "3. Use the model in your applications" 
         | 
    	
        config/__init__.py
    ADDED
    
    | @@ -0,0 +1,19 @@ | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            Configuration package for SmolLM3 training
         | 
| 3 | 
            +
            """
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from .train_smollm3 import SmolLM3Config, get_config as get_base_config
         | 
| 6 | 
            +
            from .train_smollm3_openhermes_fr import SmolLM3ConfigOpenHermesFR, get_config as get_openhermes_fr_config
         | 
| 7 | 
            +
            from .train_smollm3_openhermes_fr_a100_large import SmolLM3ConfigOpenHermesFRA100Large, get_config as get_a100_large_config
         | 
| 8 | 
            +
            from .train_smollm3_openhermes_fr_a100_multiple_passes import SmolLM3ConfigOpenHermesFRMultiplePasses, get_config as get_multiple_passes_config
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            __all__ = [
         | 
| 11 | 
            +
                'SmolLM3Config',
         | 
| 12 | 
            +
                'SmolLM3ConfigOpenHermesFR', 
         | 
| 13 | 
            +
                'SmolLM3ConfigOpenHermesFRA100Large',
         | 
| 14 | 
            +
                'SmolLM3ConfigOpenHermesFRMultiplePasses',
         | 
| 15 | 
            +
                'get_base_config',
         | 
| 16 | 
            +
                'get_openhermes_fr_config',
         | 
| 17 | 
            +
                'get_a100_large_config',
         | 
| 18 | 
            +
                'get_multiple_passes_config',
         | 
| 19 | 
            +
            ] 
         | 
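
With the package initializer above, the individual configuration factories can be imported from a single place. A small sketch; whether each module's `get_config` takes arguments is not visible in this diff, so the no-argument calls are an assumption:

```python
# Sketch only: import the factories re-exported by config/__init__.py above.
# The no-argument calls are an assumption; the factory signatures are not shown in this diff.
from config import get_base_config, get_a100_large_config

base_cfg = get_base_config()
a100_cfg = get_a100_large_config()
print(type(base_cfg).__name__, type(a100_cfg).__name__)
```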
    	
        config/runpod_config.py
    ADDED
    
    | @@ -0,0 +1,47 @@ | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            RunPod Optimized Configuration for SmolLM3 Fine-tuning
         | 
| 3 | 
            +
            Optimized for cloud GPU training on RunPod
         | 
| 4 | 
            +
            """
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from config.train_smollm3 import SmolLM3Config
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            config = SmolLM3Config(
         | 
| 9 | 
            +
                # Model configuration
         | 
| 10 | 
            +
                model_name="HuggingFaceTB/SmolLM3-3B",
         | 
| 11 | 
            +
                max_seq_length=4096,
         | 
| 12 | 
            +
                use_flash_attention=True,
         | 
| 13 | 
            +
                use_gradient_checkpointing=True,
         | 
| 14 | 
            +
                
         | 
| 15 | 
            +
                # Training configuration - optimized for cloud GPUs
         | 
| 16 | 
            +
                batch_size=2,  # Conservative for cloud stability
         | 
| 17 | 
            +
                gradient_accumulation_steps=8,  # Effective batch size = 16
         | 
| 18 | 
            +
                learning_rate=2e-5,
         | 
| 19 | 
            +
                weight_decay=0.01,
         | 
| 20 | 
            +
                warmup_steps=100,
         | 
| 21 | 
            +
                max_iters=1500,
         | 
| 22 | 
            +
                
         | 
| 23 | 
            +
                # Mixed precision for efficiency
         | 
| 24 | 
            +
                fp16=True,
         | 
| 25 | 
            +
                bf16=False,
         | 
| 26 | 
            +
                
         | 
| 27 | 
            +
                # Logging and saving - more frequent for cloud
         | 
| 28 | 
            +
                save_steps=200,
         | 
| 29 | 
            +
                eval_steps=100,
         | 
| 30 | 
            +
                logging_steps=10,
         | 
| 31 | 
            +
                save_total_limit=5,  # Keep more checkpoints
         | 
| 32 | 
            +
                
         | 
| 33 | 
            +
                # Cloud-specific optimizations
         | 
| 34 | 
            +
                ddp_backend="nccl",
         | 
| 35 | 
            +
                ddp_find_unused_parameters=False,
         | 
| 36 | 
            +
                
         | 
| 37 | 
            +
                # Data loading optimizations
         | 
| 38 | 
            +
                dataloader_num_workers=4,
         | 
| 39 | 
            +
                dataloader_pin_memory=True,
         | 
| 40 | 
            +
                
         | 
| 41 | 
            +
                # Chat template configuration
         | 
| 42 | 
            +
                use_chat_template=True,
         | 
| 43 | 
            +
                chat_template_kwargs={
         | 
| 44 | 
            +
                    "enable_thinking": False,
         | 
| 45 | 
            +
                    "add_generation_prompt": True
         | 
| 46 | 
            +
                }
         | 
| 47 | 
            +
            ) 
         | 
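
The comments in the configuration above imply an effective batch size of 16 (2 × 8). A quick consistency check against the module's exported `config` object, using only field names defined above:

```python
# Quick consistency check of the RunPod config defined above.
from config.runpod_config import config

effective_batch = config.batch_size * config.gradient_accumulation_steps
assert effective_batch == 16  # matches the "Effective batch size = 16" comment
print(f"effective batch: {effective_batch}, max_iters: {config.max_iters}")
```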
    	
        config/train_smollm3.py
    CHANGED
    
    | @@ -68,6 +68,15 @@ class SmolLM3Config: | |
| 68 | 
             
                use_chat_template: bool = True
         | 
| 69 | 
             
                chat_template_kwargs: dict = None
         | 
| 70 |  | 
| 71 | 
             
                def __post_init__(self):
         | 
| 72 | 
             
                    if self.chat_template_kwargs is None:
         | 
| 73 | 
             
                        self.chat_template_kwargs = {
         | 
|  | |
| 68 | 
             
                use_chat_template: bool = True
         | 
| 69 | 
             
                chat_template_kwargs: dict = None
         | 
| 70 |  | 
| 71 | 
            +
                # Trackio monitoring configuration
         | 
| 72 | 
            +
                enable_tracking: bool = True
         | 
| 73 | 
            +
                trackio_url: Optional[str] = None
         | 
| 74 | 
            +
                trackio_token: Optional[str] = None
         | 
| 75 | 
            +
                log_artifacts: bool = True
         | 
| 76 | 
            +
                log_metrics: bool = True
         | 
| 77 | 
            +
                log_config: bool = True
         | 
| 78 | 
            +
                experiment_name: Optional[str] = None
         | 
| 79 | 
            +
                
         | 
| 80 | 
             
                def __post_init__(self):
         | 
| 81 | 
             
                    if self.chat_template_kwargs is None:
         | 
| 82 | 
             
                        self.chat_template_kwargs = {
         | 
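These new fields only declare *what* should be tracked; the actual reporting lives in the separate `monitoring.py` added by this commit. As a minimal sketch of how a training loop might consume the flags (the `MonitoringSettings` mirror class and `maybe_log_metrics` helper below are hypothetical illustrations, not part of this repository's API):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class MonitoringSettings:
    # Mirrors the fields added to SmolLM3Config above (names taken from the diff)
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_metrics: bool = True
    experiment_name: Optional[str] = None

def maybe_log_metrics(cfg: MonitoringSettings, step: int, metrics: dict) -> None:
    """Hypothetical helper: skip logging when tracking is disabled or unconfigured."""
    if not (cfg.enable_tracking and cfg.log_metrics and cfg.trackio_url):
        return
    # A real implementation would send this to cfg.trackio_url using cfg.trackio_token;
    # printing keeps the sketch self-contained.
    print(f"[{cfg.experiment_name or 'default'}] step {step}: {metrics}")

maybe_log_metrics(
    MonitoringSettings(trackio_url="https://example.hf.space", experiment_name="demo"),
    step=10,
    metrics={"loss": 1.23},
)
```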
    	
config/train_smollm3_dpo.py    CHANGED

@@ -1,38 +1,95 @@
(The previous flat, module-level configuration (max_seq_length=4096, use_flash_attention=True,
use_gradient_checkpointing=True, weight_decay=0.01, warmup_steps=100, max_iters=1000,
logging_steps=20, ...) is removed; the file is rewritten as a proper dataclass.)

"""
SmolLM3 DPO Training Configuration
Based on nanoGPT structure but adapted for SmolLM3 DPO training
"""

import os
from dataclasses import dataclass
from typing import Optional
from config.train_smollm3 import SmolLM3Config

@dataclass
class SmolLM3DPOConfig(SmolLM3Config):
    """Configuration for SmolLM3 DPO fine-tuning"""

    # DPO-specific configuration
    beta: float = 0.1
    max_prompt_length: int = 2048
    max_length: int = 4096

    # DPO training configuration
    dpo_beta: float = 0.1
    dpo_loss_type: str = "sigmoid"  # "sigmoid" or "hinge"
    dpo_alpha: float = 0.5

    # Reference model configuration
    ref_model_name: Optional[str] = None  # If None, will use the same as model_name
    ref_model_peft_config: Optional[dict] = None

    # Preference dataset configuration
    preference_dataset_format: str = "dpo"  # "dpo", "rlhf", "custom"
    preference_dataset_text_field: str = "text"
    preference_dataset_prompt_field: str = "prompt"
    preference_dataset_chosen_field: str = "chosen"
    preference_dataset_rejected_field: str = "rejected"

    # DPO training arguments
    dpo_gradient_checkpointing: bool = True
    dpo_gradient_checkpointing_kwargs: dict = None
    dpo_precompute_ref_log_probs: bool = False
    dpo_peft_config: Optional[dict] = None

    def __post_init__(self):
        super().__post_init__()

        # Set default values for DPO-specific settings
        if self.ref_model_name is None:
            self.ref_model_name = self.model_name

        if self.dpo_gradient_checkpointing_kwargs is None:
            self.dpo_gradient_checkpointing_kwargs = {
                "use_reentrant": False
            }

        if self.dpo_peft_config is None:
            self.dpo_peft_config = {
                "r": 16,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "bias": "none",
                "task_type": "CAUSAL_LM"
            }

        # Validate DPO configuration
        if self.beta <= 0:
            raise ValueError("beta must be positive")

        if self.max_prompt_length > self.max_seq_length:
            raise ValueError("max_prompt_length cannot exceed max_seq_length")

        if self.max_length > self.max_seq_length:
            raise ValueError("max_length cannot exceed max_seq_length")

def get_dpo_config(config_path: str) -> SmolLM3DPOConfig:
    """Load DPO configuration from file or return default"""
    if os.path.exists(config_path):
        # Load from file if it exists
        import importlib.util
        spec = importlib.util.spec_from_file_location("config_module", config_path)
        config_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config_module)

        if hasattr(config_module, 'config'):
            return config_module.config
        else:
            # Try to find a config class
            for attr_name in dir(config_module):
                attr = getattr(config_module, attr_name)
                if isinstance(attr, SmolLM3DPOConfig):
                    return attr

    # Return default configuration
    return SmolLM3DPOConfig()

# Default DPO configuration instance
config = SmolLM3DPOConfig()
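A minimal usage sketch for the rewritten DPO config, assuming the repository root is on `PYTHONPATH`; it only exercises the defaults and the `__post_init__` validation shown above:

```python
from config.train_smollm3_dpo import SmolLM3DPOConfig, get_dpo_config

# Defaults: the reference model and LoRA PEFT config are filled in by __post_init__
cfg = SmolLM3DPOConfig()
assert cfg.ref_model_name == cfg.model_name
assert cfg.dpo_peft_config["r"] == 16

# Invalid settings are rejected early
try:
    SmolLM3DPOConfig(beta=0.0)
except ValueError as e:
    print(e)  # "beta must be positive"

# get_dpo_config falls back to the defaults when the path does not exist
cfg = get_dpo_config("config/does_not_exist.py")
```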
    	
config/train_smollm3_openhermes_fr.py    ADDED (+129 lines)

"""
SmolLM3 Training Configuration for OpenHermes-FR Dataset
Optimized for French instruction tuning using legmlai/openhermes-fr
"""

import os
from dataclasses import dataclass
from typing import Optional
from config.train_smollm3 import SmolLM3Config

@dataclass
class SmolLM3ConfigOpenHermesFR(SmolLM3Config):
    """Configuration for SmolLM3 fine-tuning on OpenHermes-FR dataset"""

    # Model configuration
    model_name: str = "HuggingFaceTB/SmolLM3-3B"
    max_seq_length: int = 4096
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = True

    # Training configuration - optimized for French instruction tuning
    batch_size: int = 2  # Reduced for French text (longer sequences)
    gradient_accumulation_steps: int = 8  # Increased to maintain effective batch size
    learning_rate: float = 1e-5  # Slightly lower for instruction tuning
    weight_decay: float = 0.01
    warmup_steps: int = 500  # More warmup for instruction tuning
    max_iters: int = 2000  # More iterations for large dataset
    eval_interval: int = 200
    log_interval: int = 10
    save_interval: int = 500

    # Optimizer configuration
    optimizer: str = "adamw"
    beta1: float = 0.9
    beta2: float = 0.95
    eps: float = 1e-8

    # Scheduler configuration
    scheduler: str = "cosine"
    min_lr: float = 1e-6

    # Mixed precision
    fp16: bool = True
    bf16: bool = False

    # DDP configuration
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # Logging and saving
    save_steps: int = 500
    eval_steps: int = 200
    logging_steps: int = 10
    save_total_limit: Optional[int] = 3

    # Evaluation
    eval_strategy: str = "steps"
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # OpenHermes-FR Dataset configuration
    dataset_name: str = "legmlai/openhermes-fr"
    dataset_split: str = "train"
    input_field: str = "prompt"
    target_field: str = "accepted_completion"
    filter_bad_entries: bool = True
    bad_entry_field: str = "bad_entry"

    # Data configuration (not used for HF datasets but kept for compatibility)
    data_dir: str = None
    train_file: str = None
    validation_file: Optional[str] = None
    test_file: Optional[str] = None

    # Chat template configuration
    use_chat_template: bool = True
    chat_template_kwargs: dict = None

    # Trackio monitoring configuration
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    def __post_init__(self):
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "enable_thinking": False,
                "add_generation_prompt": True
            }

        # Validate configuration
        if self.fp16 and self.bf16:
            raise ValueError("Cannot use both fp16 and bf16")

        if self.max_seq_length > 131072:  # 128k limit
            raise ValueError("max_seq_length cannot exceed 131072")

        # Set default experiment name if not provided
        if self.experiment_name is None:
            self.experiment_name = "smollm3_openhermes_fr"

def get_config(config_path: str) -> SmolLM3ConfigOpenHermesFR:
    """Load configuration from file or return default"""
    if os.path.exists(config_path):
        # Load from file if it exists
        import importlib.util
        spec = importlib.util.spec_from_file_location("config_module", config_path)
        config_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config_module)

        if hasattr(config_module, 'config'):
            return config_module.config
        else:
            # Try to find a config class
            for attr_name in dir(config_module):
                attr = getattr(config_module, attr_name)
                if isinstance(attr, SmolLM3ConfigOpenHermesFR):
                    return attr

    # Return default configuration
    return SmolLM3ConfigOpenHermesFR()

# Default configuration instance
config = SmolLM3ConfigOpenHermesFR()
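For orientation, a small sketch of what these defaults imply, assuming the ~800k row count quoted elsewhere in this commit for legmlai/openhermes-fr; the arithmetic is the only thing it adds:

```python
from config.train_smollm3_openhermes_fr import SmolLM3ConfigOpenHermesFR

cfg = SmolLM3ConfigOpenHermesFR()
effective_batch = cfg.batch_size * cfg.gradient_accumulation_steps  # 2 * 8 = 16
examples_seen = cfg.max_iters * effective_batch                     # 2,000 * 16 = 32,000
print(effective_batch, examples_seen)  # well under one pass over ~800k rows

# Field mapping used when building chat examples from the dataset:
#   cfg.input_field  -> "prompt"
#   cfg.target_field -> "accepted_completion"
# Rows where cfg.bad_entry_field ("bad_entry") is True are dropped when
# cfg.filter_bad_entries is enabled (see the data.py changes below).
```

The A100 configurations that follow exist precisely because this baseline only covers a small fraction of the dataset per run.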
    	
config/train_smollm3_openhermes_fr_a100_large.py    ADDED (+161 lines)

"""
SmolLM3 Training Configuration for OpenHermes-FR Dataset - A100 Large Scale
Optimized for A100 GPUs with large batch sizes and multiple passes on 800k+ datapoints
"""

import os
from dataclasses import dataclass
from typing import Optional
from config.train_smollm3 import SmolLM3Config

@dataclass
class SmolLM3ConfigOpenHermesFRA100Large(SmolLM3Config):
    """Configuration for SmolLM3 fine-tuning on OpenHermes-FR dataset - A100 Large Scale"""

    # Model configuration - optimized for A100
    model_name: str = "HuggingFaceTB/SmolLM3-3B"
    max_seq_length: int = 8192  # Increased for better context understanding
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = False  # Disabled for A100 efficiency

    # Training configuration - A100 optimized with large batch sizes
    batch_size: int = 8  # Large batch size for A100 (80GB VRAM)
    gradient_accumulation_steps: int = 16  # Effective batch size = 8 * 16 = 128
    learning_rate: float = 5e-6  # Lower LR for large effective batch size
    weight_decay: float = 0.01
    warmup_steps: int = 1000  # More warmup for large dataset
    max_iters: int = 8000  # Multiple passes on 800k dataset
    eval_interval: int = 500  # Less frequent evaluation
    log_interval: int = 25  # Less frequent logging
    save_interval: int = 1000  # Less frequent saving

    # Optimizer configuration - optimized for large batches
    optimizer: str = "adamw"
    beta1: float = 0.9
    beta2: float = 0.999  # Higher beta2 for stability with large batches
    eps: float = 1e-8

    # Scheduler configuration - longer training
    scheduler: str = "cosine"
    min_lr: float = 5e-7  # Lower min LR

    # Mixed precision - A100 optimized
    fp16: bool = False  # Use bf16 for A100
    bf16: bool = True  # Better for A100

    # DDP configuration
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # Logging and saving - optimized for long training
    save_steps: int = 1000
    eval_steps: int = 500
    logging_steps: int = 25
    save_total_limit: Optional[int] = 5  # Keep more checkpoints

    # Evaluation
    eval_strategy: str = "steps"
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # OpenHermes-FR Dataset configuration
    dataset_name: str = "legmlai/openhermes-fr"
    dataset_split: str = "train"
    input_field: str = "prompt"
    target_field: str = "accepted_completion"
    filter_bad_entries: bool = True
    bad_entry_field: str = "bad_entry"

    # Data configuration (not used for HF datasets but kept for compatibility)
    data_dir: str = None
    train_file: str = None
    validation_file: Optional[str] = None
    test_file: Optional[str] = None

    # Chat template configuration
    use_chat_template: bool = True
    chat_template_kwargs: dict = None

    # Trackio monitoring configuration
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    # Additional A100 optimizations
    dataloader_num_workers: int = 8  # More workers for faster data loading
    dataloader_pin_memory: bool = True
    dataloader_prefetch_factor: int = 2

    # Memory optimizations
    max_grad_norm: float = 1.0  # Gradient clipping
    group_by_length: bool = True  # Group similar length sequences

    # Training duration calculations
    # With 800k datapoints and effective batch size of 128:
    # Steps per epoch = 800,000 / 128 = 6,250 steps
    # For 3 passes: 6,250 * 3 = 18,750 steps
    # For 5 passes: 6,250 * 5 = 31,250 steps
    # Current max_iters = 8,000 (about 1.3 passes)

    def __post_init__(self):
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "enable_thinking": False,
                "add_generation_prompt": True
            }

        # Validate configuration
        if self.fp16 and self.bf16:
            raise ValueError("Cannot use both fp16 and bf16")

        if self.max_seq_length > 131072:  # 128k limit
            raise ValueError("max_seq_length cannot exceed 131072")

        # Calculate training statistics
        effective_batch_size = self.batch_size * self.gradient_accumulation_steps
        steps_per_epoch = 800000 // effective_batch_size  # Approximate for 800k dataset
        epochs_for_max_iters = self.max_iters / steps_per_epoch

        print(f"=== A100 Large Scale Training Configuration ===")
        print(f"Effective batch size: {effective_batch_size}")
        print(f"Steps per epoch: ~{steps_per_epoch}")
        print(f"Training for ~{epochs_for_max_iters:.1f} epochs")
        print(f"Total training steps: {self.max_iters}")
        print(f"Learning rate: {self.learning_rate}")
        print(f"Mixed precision: {'bf16' if self.bf16 else 'fp16'}")
        print(f"Max sequence length: {self.max_seq_length}")
        print(f"Gradient checkpointing: {self.use_gradient_checkpointing}")
        print("=" * 50)

        # Set default experiment name if not provided
        if self.experiment_name is None:
            self.experiment_name = "smollm3_openhermes_fr_a100_large"

def get_config(config_path: str) -> SmolLM3ConfigOpenHermesFRA100Large:
    """Load configuration from file or return default"""
    if os.path.exists(config_path):
        # Load from file if it exists
        import importlib.util
        spec = importlib.util.spec_from_file_location("config_module", config_path)
        config_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config_module)

        if hasattr(config_module, 'config'):
            return config_module.config
        else:
            # Try to find a config class
            for attr_name in dir(config_module):
                attr = getattr(config_module, attr_name)
                if isinstance(attr, SmolLM3ConfigOpenHermesFRA100Large):
                    return attr

    # Return default configuration
    return SmolLM3ConfigOpenHermesFRA100Large()

# Default configuration instance
config = SmolLM3ConfigOpenHermesFRA100Large()
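The comments in this file do the epoch arithmetic by hand; a short sketch that reproduces the same numbers from the config values, assuming (as the file does) roughly 800,000 usable rows after filtering:

```python
from config.train_smollm3_openhermes_fr_a100_large import SmolLM3ConfigOpenHermesFRA100Large

cfg = SmolLM3ConfigOpenHermesFRA100Large()  # also prints the summary from __post_init__
effective_batch = cfg.batch_size * cfg.gradient_accumulation_steps  # 8 * 16 = 128
steps_per_epoch = 800_000 // effective_batch                        # 6,250
passes = cfg.max_iters / steps_per_epoch                            # 8,000 / 6,250 = 1.28
print(f"{passes:.2f} passes over the dataset")
```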
    	
config/train_smollm3_openhermes_fr_a100_multiple_passes.py    ADDED (+164 lines)

"""
SmolLM3 Training Configuration for OpenHermes-FR Dataset - Multiple Passes
Optimized for A100 GPUs with multiple passes (3-5 epochs) on 800k+ datapoints
"""

import os
from dataclasses import dataclass
from typing import Optional
from config.train_smollm3 import SmolLM3Config

@dataclass
class SmolLM3ConfigOpenHermesFRMultiplePasses(SmolLM3Config):
    """Configuration for SmolLM3 fine-tuning with multiple passes on OpenHermes-FR dataset"""

    # Model configuration - optimized for A100
    model_name: str = "HuggingFaceTB/SmolLM3-3B"
    max_seq_length: int = 8192  # Increased for better context understanding
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = False  # Disabled for A100 efficiency

    # Training configuration - Multiple passes optimized
    batch_size: int = 6  # Slightly smaller for stability during long training
    gradient_accumulation_steps: int = 20  # Effective batch size = 6 * 20 = 120
    learning_rate: float = 3e-6  # Conservative LR for multiple passes
    weight_decay: float = 0.01
    warmup_steps: int = 2000  # Longer warmup for multiple passes
    max_iters: int = 25000  # 4 passes on 800k dataset (25k steps)
    eval_interval: int = 1000  # Less frequent evaluation
    log_interval: int = 50  # Less frequent logging
    save_interval: int = 2000  # Less frequent saving

    # Optimizer configuration - stability focused
    optimizer: str = "adamw"
    beta1: float = 0.9
    beta2: float = 0.999  # Higher beta2 for stability
    eps: float = 1e-8

    # Scheduler configuration - longer training with multiple passes
    scheduler: str = "cosine"
    min_lr: float = 3e-7  # Lower min LR

    # Mixed precision - A100 optimized
    fp16: bool = False  # Use bf16 for A100
    bf16: bool = True  # Better for A100

    # DDP configuration
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # Logging and saving - optimized for long training
    save_steps: int = 2000
    eval_steps: int = 1000
    logging_steps: int = 50
    save_total_limit: Optional[int] = 8  # Keep more checkpoints for long training

    # Evaluation
    eval_strategy: str = "steps"
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # OpenHermes-FR Dataset configuration
    dataset_name: str = "legmlai/openhermes-fr"
    dataset_split: str = "train"
    input_field: str = "prompt"
    target_field: str = "accepted_completion"
    filter_bad_entries: bool = True
    bad_entry_field: str = "bad_entry"

    # Data configuration (not used for HF datasets but kept for compatibility)
    data_dir: str = None
    train_file: str = None
    validation_file: Optional[str] = None
    test_file: Optional[str] = None

    # Chat template configuration
    use_chat_template: bool = True
    chat_template_kwargs: dict = None

    # Trackio monitoring configuration
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    # Additional A100 optimizations
    dataloader_num_workers: int = 8  # More workers for faster data loading
    dataloader_pin_memory: bool = True
    dataloader_prefetch_factor: int = 2

    # Memory optimizations
    max_grad_norm: float = 1.0  # Gradient clipping
    group_by_length: bool = True  # Group similar length sequences

    # Training duration calculations
    # With 800k datapoints and effective batch size of 120:
    # Steps per epoch = 800,000 / 120 = 6,667 steps
    # For 3 passes: 6,667 * 3 = 20,000 steps
    # For 4 passes: 6,667 * 4 = 26,667 steps
    # For 5 passes: 6,667 * 5 = 33,333 steps
    # Current max_iters = 25,000 (about 3.75 passes)

    def __post_init__(self):
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "enable_thinking": False,
                "add_generation_prompt": True
            }

        # Validate configuration
        if self.fp16 and self.bf16:
            raise ValueError("Cannot use both fp16 and bf16")

        if self.max_seq_length > 131072:  # 128k limit
            raise ValueError("max_seq_length cannot exceed 131072")

        # Calculate training statistics
        effective_batch_size = self.batch_size * self.gradient_accumulation_steps
        steps_per_epoch = 800000 // effective_batch_size  # Approximate for 800k dataset
        epochs_for_max_iters = self.max_iters / steps_per_epoch

        print(f"=== Multiple Passes Training Configuration ===")
        print(f"Effective batch size: {effective_batch_size}")
        print(f"Steps per epoch: ~{steps_per_epoch}")
        print(f"Training for ~{epochs_for_max_iters:.1f} epochs")
        print(f"Total training steps: {self.max_iters}")
        print(f"Learning rate: {self.learning_rate}")
        print(f"Mixed precision: {'bf16' if self.bf16 else 'fp16'}")
        print(f"Max sequence length: {self.max_seq_length}")
        print(f"Gradient checkpointing: {self.use_gradient_checkpointing}")
        print(f"Warmup steps: {self.warmup_steps}")
        print(f"Save interval: {self.save_interval}")
        print("=" * 50)

        # Set default experiment name if not provided
        if self.experiment_name is None:
            self.experiment_name = "smollm3_openhermes_fr_multiple_passes"

def get_config(config_path: str) -> SmolLM3ConfigOpenHermesFRMultiplePasses:
    """Load configuration from file or return default"""
    if os.path.exists(config_path):
        # Load from file if it exists
        import importlib.util
        spec = importlib.util.spec_from_file_location("config_module", config_path)
        config_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config_module)

        if hasattr(config_module, 'config'):
            return config_module.config
        else:
            # Try to find a config class
            for attr_name in dir(config_module):
                attr = getattr(config_module, attr_name)
                if isinstance(attr, SmolLM3ConfigOpenHermesFRMultiplePasses):
                    return attr

    # Return default configuration
    return SmolLM3ConfigOpenHermesFRMultiplePasses()

# Default configuration instance
config = SmolLM3ConfigOpenHermesFRMultiplePasses()
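To retarget this configuration at a different number of passes, only `max_iters` needs to change. A helper sketch under the same ~800k row assumption; `iters_for_passes` is a hypothetical convenience function, not part of the repository:

```python
from config.train_smollm3_openhermes_fr_a100_multiple_passes import (
    SmolLM3ConfigOpenHermesFRMultiplePasses,
)

def iters_for_passes(cfg, num_passes: int, dataset_size: int = 800_000) -> int:
    """Steps needed for num_passes epochs at the config's effective batch size."""
    effective_batch = cfg.batch_size * cfg.gradient_accumulation_steps  # 6 * 20 = 120
    return (dataset_size // effective_batch) * num_passes

cfg = SmolLM3ConfigOpenHermesFRMultiplePasses()
print(iters_for_passes(cfg, 3))  # 19,998 (about 3 epochs)
print(iters_for_passes(cfg, 5))  # 33,330 (about 5 epochs)
```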
    	
        data.py
    CHANGED
    
    | @@ -22,13 +22,17 @@ class SmolLM3Dataset: | |
| 22 | 
             
                    tokenizer: PreTrainedTokenizer,
         | 
| 23 | 
             
                    max_seq_length: int = 4096,
         | 
| 24 | 
             
                    use_chat_template: bool = True,
         | 
| 25 | 
            -
                    chat_template_kwargs: Optional[Dict] = None
         | 
|  | |
|  | |
| 26 | 
             
                ):
         | 
| 27 | 
             
                    self.data_path = data_path
         | 
| 28 | 
             
                    self.tokenizer = tokenizer
         | 
| 29 | 
             
                    self.max_seq_length = max_seq_length
         | 
| 30 | 
             
                    self.use_chat_template = use_chat_template
         | 
| 31 | 
             
                    self.chat_template_kwargs = chat_template_kwargs or {}
         | 
|  | |
|  | |
| 32 |  | 
| 33 | 
             
                    # Load and process dataset
         | 
| 34 | 
             
                    self.dataset = self._load_dataset()
         | 
| @@ -74,6 +78,17 @@ class SmolLM3Dataset: | |
| 74 | 
             
        tokenizer: PreTrainedTokenizer,
        max_seq_length: int = 4096,
        use_chat_template: bool = True,
+       chat_template_kwargs: Optional[Dict] = None,
+       filter_bad_entries: bool = False,
+       bad_entry_field: str = "bad_entry"
    ):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.use_chat_template = use_chat_template
        self.chat_template_kwargs = chat_template_kwargs or {}
+       self.filter_bad_entries = filter_bad_entries
+       self.bad_entry_field = bad_entry_field

        # Load and process dataset
        self.dataset = self._load_dataset()

        try:
            dataset = load_dataset(self.data_path)
            logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
+
+           # Filter bad entries if requested
+           if self.filter_bad_entries and self.bad_entry_field in dataset["train"].column_names:
+               logger.info(f"Filtering out bad entries using field: {self.bad_entry_field}")
+               for split in dataset:
+                   if self.bad_entry_field in dataset[split].column_names:
+                       original_size = len(dataset[split])
+                       dataset[split] = dataset[split].filter(lambda x: not x[self.bad_entry_field])
+                       filtered_size = len(dataset[split])
+                       logger.info(f"Filtered {split}: {original_size} -> {filtered_size} samples")
+
            # If only 'train' split exists, create validation and test splits
            if ("train" in dataset) and ("validation" not in dataset or "test" not in dataset):
                logger.info("Automatically splitting train into train/validation/test (98/1/1)")

@@ -123,6 +138,11 @@ class SmolLM3Dataset:
                        {"role": "user", "content": example["prompt"]},
                        {"role": "assistant", "content": example["accepted_completion"]}
                    ]
+               elif "prompt" in example and "completion" in example:
+                   messages = [
+                       {"role": "user", "content": example["prompt"]},
+                       {"role": "assistant", "content": example["completion"]}
+                   ]
                else:
                    # Fallback: treat as plain text
                    return {"text": str(example)}
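The new `filter_bad_entries` / `bad_entry_field` options let the loader drop rows the dataset has already flagged as bad before any tokenization happens. A minimal usage sketch from the repo root; the model and dataset identifiers below are illustrative assumptions, not values taken from this commit:

```python
from transformers import AutoTokenizer
from data import SmolLM3Dataset

# Placeholder identifiers for illustration only.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")

dataset = SmolLM3Dataset(
    data_path="legmlai/openhermes-fr",   # assumed Hub id of the OpenHermes-FR dataset
    tokenizer=tokenizer,
    max_seq_length=4096,
    use_chat_template=True,
    filter_bad_entries=True,             # rows where bad_entry is truthy are removed
    bad_entry_field="bad_entry",
)
```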
    	
        deploy_trackio_space.py
    ADDED
    
@@ -0,0 +1,235 @@
#!/usr/bin/env python3
"""
Deployment script for Trackio on Hugging Face Spaces
Automates the process of creating and configuring a Trackio Space
"""

import os
import json
import requests
import subprocess
import sys
from pathlib import Path
from typing import Dict, Any, Optional

class TrackioSpaceDeployer:
    """Deployer for Trackio on Hugging Face Spaces"""

    def __init__(self, space_name: str, username: str, token: str):
        self.space_name = space_name
        self.username = username
        self.token = token
        self.space_url = f"https://huggingface.co/spaces/{username}/{space_name}"

    def create_space(self) -> bool:
        """Create a new Hugging Face Space"""
        try:
            print(f"Creating Space: {self.space_name}")

            # Create space using Hugging Face CLI
            cmd = [
                "huggingface-cli", "repo", "create",
                f"{self.username}/{self.space_name}",
                "--type", "space",
                "--space-sdk", "gradio",
                "--space-hardware", "cpu-basic"
            ]

            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                print(f"✅ Space created successfully: {self.space_url}")
                return True
            else:
                print(f"❌ Failed to create space: {result.stderr}")
                return False

        except Exception as e:
            print(f"❌ Error creating space: {e}")
            return False

    def upload_files(self) -> bool:
        """Upload necessary files to the Space"""
        try:
            print("Uploading files to Space...")

            # Files to upload
            files_to_upload = [
                "app.py",
                "requirements_space.txt",
                "README.md"
            ]

            for file_path in files_to_upload:
                if os.path.exists(file_path):
                    # Use git to add and push files
                    subprocess.run(["git", "add", file_path], check=True)
                    subprocess.run(["git", "commit", "-m", f"Add {file_path}"], check=True)
                    subprocess.run(["git", "push"], check=True)
                    print(f"✅ Uploaded {file_path}")
                else:
                    print(f"⚠️  File not found: {file_path}")

            return True

        except Exception as e:
            print(f"❌ Error uploading files: {e}")
            return False

    def configure_space(self) -> bool:
        """Configure the Space settings"""
        try:
            print("Configuring Space settings...")

            # Create space configuration
            space_config = {
                "title": "Trackio - Experiment Tracking",
                "emoji": "🚀",
                "colorFrom": "blue",
                "colorTo": "purple",
                "sdk": "gradio",
                "sdk_version": "4.0.0",
                "app_file": "app.py",
                "pinned": False
            }

            # Write README.md for the space
            space_readme = f"""---
title: Trackio for Petite Elle L'Aime
emoji: 🐠
colorFrom: indigo
colorTo: yellow
sdk: gradio
sdk_version: 5.38.0
app_file: app.py
pinned: true
license: mit
short_description: trackio for training monitoring
---

# Trackio Experiment Tracking

A Gradio interface for experiment tracking and monitoring.

## Features

- Create and manage experiments
- Log training metrics and parameters
- View experiment details and results
- Update experiment status

## Usage

1. Create a new experiment using the "Create Experiment" tab
2. Log metrics during training using the "Log Metrics" tab
3. View experiment details using the "View Experiments" tab
4. Update experiment status using the "Update Status" tab

## Integration

To connect your training script to this Trackio Space:

```python
from monitoring import SmolLM3Monitor

monitor = SmolLM3Monitor(
    experiment_name="my_experiment",
    trackio_url="{self.space_url}",
    enable_tracking=True
)
```

Visit: {self.space_url}
"""

            with open("README.md", "w") as f:
                f.write(space_readme)

            return True

        except Exception as e:
            print(f"❌ Error configuring space: {e}")
            return False

    def test_space(self) -> bool:
        """Test if the Space is working correctly"""
        try:
            print("Testing Space...")

            # Wait a bit for the space to build
            import time
            time.sleep(30)

            # Try to access the space
            response = requests.get(self.space_url, timeout=10)

            if response.status_code == 200:
                print(f"✅ Space is accessible: {self.space_url}")
                return True
            else:
                print(f"⚠️  Space returned status code: {response.status_code}")
                return False

        except Exception as e:
            print(f"❌ Error testing space: {e}")
            return False

    def deploy(self) -> bool:
        """Complete deployment process"""
        print("🚀 Starting Trackio Space deployment...")

        # Step 1: Create space
        if not self.create_space():
            return False

        # Step 2: Configure space
        if not self.configure_space():
            return False

        # Step 3: Upload files
        if not self.upload_files():
            return False

        # Step 4: Test space
        if not self.test_space():
            print("⚠️  Space created but may need time to build")

        print(f"🎉 Deployment completed!")
        print(f"📊 Trackio Space URL: {self.space_url}")
        print(f"🔧 Space configuration: {self.space_url}/settings")

        return True

def main():
    """Main deployment function"""
    print("Trackio Space Deployment Script")
    print("=" * 40)

    # Get user input
    username = input("Enter your Hugging Face username: ").strip()
    space_name = input("Enter Space name (e.g., trackio-monitoring): ").strip()
    token = input("Enter your Hugging Face token (optional): ").strip()

    if not username or not space_name:
        print("❌ Username and Space name are required")
        sys.exit(1)

    # Create deployer
    deployer = TrackioSpaceDeployer(space_name, username, token)

    # Run deployment
    success = deployer.deploy()

    if success:
        print("\n✅ Deployment successful!")
        print(f"🌐 Your Trackio Space: {deployer.space_url}")
        print("\nNext steps:")
        print("1. Wait for the Space to build (usually 2-5 minutes)")
        print("2. Test the interface by visiting the Space URL")
        print("3. Use the Space URL in your training scripts")
    else:
        print("\n❌ Deployment failed!")
        print("Check the error messages above and try again.")

if __name__ == "__main__":
    main()
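Besides the interactive `main()` flow above, the deployer class can also be driven programmatically. A short sketch, assuming a write-scoped token in `HF_TOKEN`; the username and Space name are placeholders:

```python
import os
from deploy_trackio_space import TrackioSpaceDeployer

deployer = TrackioSpaceDeployer(
    space_name="trackio-monitoring",   # placeholder Space name
    username="your-username",          # placeholder HF username
    token=os.getenv("HF_TOKEN", ""),
)

if deployer.deploy():
    print(f"Trackio Space ready at {deployer.space_url}")
```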
    	
        monitoring.py
    ADDED
    
@@ -0,0 +1,298 @@
"""
Trackio Monitoring Integration for SmolLM3 Fine-tuning
Provides comprehensive experiment tracking and monitoring capabilities
"""

import os
import json
import logging
from typing import Dict, Any, Optional, List
from datetime import datetime
import torch
from pathlib import Path

try:
    import trackio
    from trackio import TrackioClient
    TRACKIO_AVAILABLE = True
except ImportError:
    TRACKIO_AVAILABLE = False
    print("Warning: Trackio not available. Install with: pip install trackio")

logger = logging.getLogger(__name__)

class SmolLM3Monitor:
    """Monitoring and tracking for SmolLM3 fine-tuning experiments"""

    def __init__(
        self,
        experiment_name: str,
        trackio_url: Optional[str] = None,
        trackio_token: Optional[str] = None,
        enable_tracking: bool = True,
        log_artifacts: bool = True,
        log_metrics: bool = True,
        log_config: bool = True
    ):
        self.experiment_name = experiment_name
        self.enable_tracking = enable_tracking and TRACKIO_AVAILABLE
        self.log_artifacts = log_artifacts
        self.log_metrics = log_metrics
        self.log_config = log_config

        # Initialize Trackio client
        self.trackio_client = None
        if self.enable_tracking:
            self._setup_trackio(trackio_url, trackio_token)

        # Experiment metadata
        self.experiment_id = None
        self.start_time = datetime.now()
        self.metrics_history = []
        self.artifacts = []

        logger.info(f"Initialized monitoring for experiment: {experiment_name}")

    def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
        """Setup Trackio client"""
        try:
            # Get Trackio configuration from environment or parameters
            url = trackio_url or os.getenv('TRACKIO_URL')
            token = trackio_token or os.getenv('TRACKIO_TOKEN')

            if not url:
                logger.warning("Trackio URL not provided. Set TRACKIO_URL environment variable.")
                self.enable_tracking = False
                return

            self.trackio_client = TrackioClient(
                url=url,
                token=token
            )

            # Create or get experiment
            self.experiment_id = self.trackio_client.create_experiment(
                name=self.experiment_name,
                description=f"SmolLM3 fine-tuning experiment started at {self.start_time}"
            )

            logger.info(f"Trackio client initialized. Experiment ID: {self.experiment_id}")

        except Exception as e:
            logger.error(f"Failed to initialize Trackio: {e}")
            self.enable_tracking = False

    def log_config(self, config: Dict[str, Any]):
        """Log experiment configuration"""
        if not self.enable_tracking or not self.log_config:
            return

        try:
            # Log configuration as parameters
            self.trackio_client.log_parameters(
                experiment_id=self.experiment_id,
                parameters=config
            )

            # Also save config locally
            config_path = f"config_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
            with open(config_path, 'w') as f:
                json.dump(config, f, indent=2, default=str)

            self.artifacts.append(config_path)
            logger.info(f"Configuration logged to Trackio and saved to {config_path}")

        except Exception as e:
            logger.error(f"Failed to log configuration: {e}")

    def log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None):
        """Log training metrics"""
        if not self.enable_tracking or not self.log_metrics:
            return

        try:
            # Add timestamp
            metrics['timestamp'] = datetime.now().isoformat()
            if step is not None:
                metrics['step'] = step

            # Log to Trackio
            self.trackio_client.log_metrics(
                experiment_id=self.experiment_id,
                metrics=metrics,
                step=step
            )

            # Store locally
            self.metrics_history.append(metrics)

            logger.debug(f"Metrics logged: {metrics}")

        except Exception as e:
            logger.error(f"Failed to log metrics: {e}")

    def log_model_checkpoint(self, checkpoint_path: str, step: Optional[int] = None):
        """Log model checkpoint"""
        if not self.enable_tracking or not self.log_artifacts:
            return

        try:
            # Log checkpoint as artifact
            self.trackio_client.log_artifact(
                experiment_id=self.experiment_id,
                file_path=checkpoint_path,
                artifact_name=f"checkpoint_step_{step}" if step else "checkpoint"
            )

            self.artifacts.append(checkpoint_path)
            logger.info(f"Checkpoint logged: {checkpoint_path}")

        except Exception as e:
            logger.error(f"Failed to log checkpoint: {e}")

    def log_evaluation_results(self, results: Dict[str, Any], step: Optional[int] = None):
        """Log evaluation results"""
        if not self.enable_tracking:
            return

        try:
            # Add evaluation prefix to metrics
            eval_metrics = {f"eval_{k}": v for k, v in results.items()}

            self.log_metrics(eval_metrics, step)

            # Save evaluation results locally
            eval_path = f"eval_results_step_{step}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
            with open(eval_path, 'w') as f:
                json.dump(results, f, indent=2, default=str)

            self.artifacts.append(eval_path)
            logger.info(f"Evaluation results logged and saved to {eval_path}")

        except Exception as e:
            logger.error(f"Failed to log evaluation results: {e}")

    def log_system_metrics(self, step: Optional[int] = None):
        """Log system metrics (GPU, memory, etc.)"""
        if not self.enable_tracking:
            return

        try:
            system_metrics = {}

            # GPU metrics
            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    system_metrics[f'gpu_{i}_memory_allocated'] = torch.cuda.memory_allocated(i) / 1024**3  # GB
                    system_metrics[f'gpu_{i}_memory_reserved'] = torch.cuda.memory_reserved(i) / 1024**3  # GB
                    system_metrics[f'gpu_{i}_utilization'] = torch.cuda.utilization(i) if hasattr(torch.cuda, 'utilization') else 0

            # CPU and memory metrics (basic)
            import psutil
            system_metrics['cpu_percent'] = psutil.cpu_percent()
            system_metrics['memory_percent'] = psutil.virtual_memory().percent

            self.log_metrics(system_metrics, step)

        except Exception as e:
            logger.error(f"Failed to log system metrics: {e}")

    def log_training_summary(self, summary: Dict[str, Any]):
        """Log training summary at the end"""
        if not self.enable_tracking:
            return

        try:
            # Add experiment duration
            end_time = datetime.now()
            duration = (end_time - self.start_time).total_seconds()
            summary['experiment_duration_seconds'] = duration
            summary['experiment_duration_hours'] = duration / 3600

            # Log final summary
            self.trackio_client.log_parameters(
                experiment_id=self.experiment_id,
                parameters=summary
            )

            # Save summary locally
            summary_path = f"training_summary_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
            with open(summary_path, 'w') as f:
                json.dump(summary, f, indent=2, default=str)

            self.artifacts.append(summary_path)
            logger.info(f"Training summary logged and saved to {summary_path}")

        except Exception as e:
            logger.error(f"Failed to log training summary: {e}")

    def create_monitoring_callback(self):
        """Create a callback for integration with Hugging Face Trainer"""
        if not self.enable_tracking:
            return None

        class TrackioCallback:
            def __init__(self, monitor):
                self.monitor = monitor

            def on_log(self, args, state, control, logs=None, **kwargs):
                """Called when logs are created"""
                if logs:
                    self.monitor.log_metrics(logs, state.global_step)
                    self.monitor.log_system_metrics(state.global_step)

            def on_save(self, args, state, control, **kwargs):
                """Called when a checkpoint is saved"""
                checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
                if os.path.exists(checkpoint_path):
                    self.monitor.log_model_checkpoint(checkpoint_path, state.global_step)

            def on_evaluate(self, args, state, control, metrics=None, **kwargs):
                """Called when evaluation is performed"""
                if metrics:
                    self.monitor.log_evaluation_results(metrics, state.global_step)

        return TrackioCallback(self)

    def get_experiment_url(self) -> Optional[str]:
        """Get the URL to view the experiment in Trackio"""
        if self.trackio_client and self.experiment_id:
            return f"{self.trackio_client.url}/experiments/{self.experiment_id}"
        return None

    def close(self):
        """Close the monitoring session"""
        if self.enable_tracking and self.trackio_client:
            try:
                # Mark experiment as completed
                self.trackio_client.update_experiment_status(
                    experiment_id=self.experiment_id,
                    status="completed"
                )
                logger.info("Monitoring session closed")
            except Exception as e:
                logger.error(f"Failed to close monitoring session: {e}")

# Utility function to create monitor from config
def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
    """Create a monitor instance from configuration"""
    if experiment_name is None:
        experiment_name = f"smollm3_finetune_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    # Extract monitoring configuration
    trackio_url = getattr(config, 'trackio_url', None)
    trackio_token = getattr(config, 'trackio_token', None)
    enable_tracking = getattr(config, 'enable_tracking', True)
    log_artifacts = getattr(config, 'log_artifacts', True)
    log_metrics = getattr(config, 'log_metrics', True)
    log_config = getattr(config, 'log_config', True)

    return SmolLM3Monitor(
        experiment_name=experiment_name,
        trackio_url=trackio_url,
        trackio_token=trackio_token,
        enable_tracking=enable_tracking,
        log_artifacts=log_artifacts,
        log_metrics=log_metrics,
        log_config=log_config
    )
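When no explicit URL or token is passed, `_setup_trackio` falls back to the `TRACKIO_URL` and `TRACKIO_TOKEN` environment variables, and tracking quietly disables itself if the `trackio` package or the URL is missing. A minimal sketch of that configuration path; the URL and token values are placeholders:

```python
import os

# Placeholders; _setup_trackio reads these when no arguments are given.
os.environ["TRACKIO_URL"] = "https://huggingface.co/spaces/your-username/trackio-monitoring"
os.environ["TRACKIO_TOKEN"] = "hf_xxx"  # optional

from monitoring import SmolLM3Monitor

monitor = SmolLM3Monitor(experiment_name="smollm3_openhermes_fr_a100")
print(monitor.get_experiment_url())  # None if tracking could not be enabled
```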
    	
        push_to_huggingface.py
    ADDED
    
@@ -0,0 +1,486 @@
#!/usr/bin/env python3
"""
Push Trained Model and Results to Hugging Face Hub
Integrates with Trackio monitoring and provides complete model deployment
"""

import os
import json
import argparse
import logging
from pathlib import Path
from typing import Dict, Any, Optional, List
from datetime import datetime
import subprocess
import shutil

try:
    from huggingface_hub import HfApi, create_repo, upload_file
    from huggingface_hub import snapshot_download, hf_hub_download
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")

try:
    from monitoring import SmolLM3Monitor
    MONITORING_AVAILABLE = True
except ImportError:
    MONITORING_AVAILABLE = False
    print("Warning: monitoring module not available")

logger = logging.getLogger(__name__)

class HuggingFacePusher:
    """Push trained models and results to Hugging Face Hub"""

    def __init__(
        self,
        model_path: str,
        repo_name: str,
        token: Optional[str] = None,
        private: bool = False,
        trackio_url: Optional[str] = None,
        experiment_name: Optional[str] = None
    ):
        self.model_path = Path(model_path)
        self.repo_name = repo_name
        self.token = token or os.getenv('HF_TOKEN')
        self.private = private
        self.trackio_url = trackio_url
        self.experiment_name = experiment_name

        # Initialize HF API
        if HF_AVAILABLE:
            self.api = HfApi(token=self.token)
        else:
            raise ImportError("huggingface_hub is required. Install with: pip install huggingface_hub")

        # Initialize monitoring if available
        self.monitor = None
        if MONITORING_AVAILABLE and trackio_url:
            self.monitor = SmolLM3Monitor(
                experiment_name=experiment_name or "model_push",
                trackio_url=trackio_url,
                enable_tracking=True
            )

        logger.info(f"Initialized HuggingFacePusher for {repo_name}")

    def create_repository(self) -> bool:
        """Create the Hugging Face repository"""
        try:
            logger.info(f"Creating repository: {self.repo_name}")

            # Create repository
            create_repo(
                repo_id=self.repo_name,
                token=self.token,
                private=self.private,
                exist_ok=True
            )

            logger.info(f"✅ Repository created: https://huggingface.co/{self.repo_name}")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to create repository: {e}")
            return False

    def validate_model_path(self) -> bool:
        """Validate that the model path contains required files"""
        required_files = [
            "config.json",
            "pytorch_model.bin",
            "tokenizer.json",
| 96 | 
            +
                        "tokenizer_config.json"
         | 
| 97 | 
            +
                    ]
         | 
| 98 | 
            +
                    
         | 
| 99 | 
            +
                    missing_files = []
         | 
| 100 | 
            +
                    for file in required_files:
         | 
| 101 | 
            +
                        if not (self.model_path / file).exists():
         | 
| 102 | 
            +
                            missing_files.append(file)
         | 
| 103 | 
            +
                    
         | 
| 104 | 
            +
                    if missing_files:
         | 
| 105 | 
            +
                        logger.error(f"❌ Missing required files: {missing_files}")
         | 
| 106 | 
            +
                        return False
         | 
| 107 | 
            +
                    
         | 
| 108 | 
            +
                    logger.info("✅ Model files validated")
         | 
| 109 | 
            +
                    return True
         | 
| 110 | 
            +
                
         | 
| 111 | 
            +
                def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
         | 
| 112 | 
            +
                    """Create a comprehensive model card"""
         | 
| 113 | 
            +
                    model_card = f"""---
         | 
| 114 | 
            +
            language:
         | 
| 115 | 
            +
            - en
         | 
| 116 | 
            +
            license: mit
         | 
| 117 | 
            +
            tags:
         | 
| 118 | 
            +
            - smollm3
         | 
| 119 | 
            +
            - fine-tuned
         | 
| 120 | 
            +
            - text-generation
         | 
| 121 | 
            +
            - transformers
         | 
| 122 | 
            +
            ---
         | 
| 123 | 
            +
             | 
| 124 | 
            +
            # {self.repo_name.split('/')[-1]}
         | 
| 125 | 
            +
             | 
| 126 | 
            +
            This is a fine-tuned SmolLM3 model based on the HuggingFaceTB/SmolLM3-3B architecture.
         | 
| 127 | 
            +
             | 
| 128 | 
            +
            ## Model Details
         | 
| 129 | 
            +
             | 
| 130 | 
            +
            - **Base Model**: HuggingFaceTB/SmolLM3-3B
         | 
| 131 | 
            +
            - **Fine-tuning Method**: Supervised Fine-tuning
         | 
| 132 | 
            +
            - **Training Date**: {datetime.now().strftime('%Y-%m-%d')}
         | 
| 133 | 
            +
            - **Model Size**: {self._get_model_size():.1f} GB
         | 
| 134 | 
            +
             | 
| 135 | 
            +
            ## Training Configuration
         | 
| 136 | 
            +
             | 
| 137 | 
            +
            ```json
         | 
| 138 | 
            +
            {json.dumps(training_config, indent=2)}
         | 
| 139 | 
            +
            ```
         | 
| 140 | 
            +
             | 
| 141 | 
            +
            ## Training Results
         | 
| 142 | 
            +
             | 
| 143 | 
            +
            ```json
         | 
| 144 | 
            +
            {json.dumps(results, indent=2)}
         | 
| 145 | 
            +
            ```
         | 
| 146 | 
            +
             | 
| 147 | 
            +
            ## Usage
         | 
| 148 | 
            +
             | 
| 149 | 
            +
            ```python
         | 
| 150 | 
            +
            from transformers import AutoModelForCausalLM, AutoTokenizer
         | 
| 151 | 
            +
             | 
| 152 | 
            +
            # Load model and tokenizer
         | 
| 153 | 
            +
            model = AutoModelForCausalLM.from_pretrained("{self.repo_name}")
         | 
| 154 | 
            +
            tokenizer = AutoTokenizer.from_pretrained("{self.repo_name}")
         | 
| 155 | 
            +
             | 
| 156 | 
            +
            # Generate text
         | 
| 157 | 
            +
            inputs = tokenizer("Hello, how are you?", return_tensors="pt")
         | 
| 158 | 
            +
            outputs = model.generate(**inputs, max_new_tokens=100)
         | 
| 159 | 
            +
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))
         | 
| 160 | 
            +
            ```
         | 
| 161 | 
            +
             | 
| 162 | 
            +
            ## Training Information
         | 
| 163 | 
            +
             | 
| 164 | 
            +
            - **Framework**: Transformers
         | 
| 165 | 
            +
            - **Hardware**: {self._get_hardware_info()}
         | 
| 166 | 
            +
            - **Training Time**: {results.get('training_time_hours', 'Unknown')} hours
         | 
| 167 | 
            +
            - **Final Loss**: {results.get('final_loss', 'Unknown')}
         | 
| 168 | 
            +
            - **Final Accuracy**: {results.get('final_accuracy', 'Unknown')}
         | 
| 169 | 
            +
             | 
| 170 | 
            +
            ## Model Performance
         | 
| 171 | 
            +
             | 
| 172 | 
            +
            - **Training Loss**: {results.get('train_loss', 'Unknown')}
         | 
| 173 | 
            +
            - **Validation Loss**: {results.get('eval_loss', 'Unknown')}
         | 
| 174 | 
            +
            - **Training Steps**: {results.get('total_steps', 'Unknown')}
         | 
| 175 | 
            +
             | 
| 176 | 
            +
            ## Limitations and Biases
         | 
| 177 | 
            +
             | 
| 178 | 
            +
            This model is fine-tuned for specific tasks and may not generalize well to all use cases. Please evaluate the model's performance on your specific task before deployment.
         | 
| 179 | 
            +
             | 
| 180 | 
            +
            ## License
         | 
| 181 | 
            +
             | 
| 182 | 
            +
            This model is licensed under the MIT License.
         | 
| 183 | 
            +
            """
         | 
| 184 | 
            +
                    return model_card
         | 
| 185 | 
            +
                
         | 
| 186 | 
            +
                def _get_model_size(self) -> float:
         | 
| 187 | 
            +
                    """Get model size in GB"""
         | 
| 188 | 
            +
                    try:
         | 
| 189 | 
            +
                        total_size = 0
         | 
| 190 | 
            +
                        for file in self.model_path.rglob("*"):
         | 
| 191 | 
            +
                            if file.is_file():
         | 
| 192 | 
            +
                                total_size += file.stat().st_size
         | 
| 193 | 
            +
                        return total_size / (1024**3)  # Convert to GB
         | 
| 194 | 
            +
                    except Exception:
         | 
| 195 | 
            +
                        return 0.0
         | 
| 196 | 
            +
                
         | 
| 197 | 
            +
                def _get_hardware_info(self) -> str:
         | 
| 198 | 
            +
                    """Get hardware information"""
         | 
| 199 | 
            +
                    try:
         | 
| 200 | 
            +
                        import torch
         | 
| 201 | 
            +
                        if torch.cuda.is_available():
         | 
| 202 | 
            +
                            gpu_name = torch.cuda.get_device_name(0)
         | 
| 203 | 
            +
                            return f"GPU: {gpu_name}"
         | 
| 204 | 
            +
                        else:
         | 
| 205 | 
            +
                            return "CPU"
         | 
| 206 | 
            +
                    except Exception:
         | 
| 207 | 
            +
                        return "Unknown"
         | 
| 208 | 
            +
                
         | 
| 209 | 
            +
                def upload_model_files(self) -> bool:
         | 
| 210 | 
            +
                    """Upload model files to Hugging Face Hub"""
         | 
| 211 | 
            +
                    try:
         | 
| 212 | 
            +
                        logger.info("Uploading model files...")
         | 
| 213 | 
            +
                        
         | 
| 214 | 
            +
                        # Upload all files in the model directory
         | 
| 215 | 
            +
                        for file_path in self.model_path.rglob("*"):
         | 
| 216 | 
            +
                            if file_path.is_file():
         | 
| 217 | 
            +
                                relative_path = file_path.relative_to(self.model_path)
         | 
| 218 | 
            +
                                remote_path = str(relative_path)
         | 
| 219 | 
            +
                                
         | 
| 220 | 
            +
                                logger.info(f"Uploading {relative_path}")
         | 
| 221 | 
            +
                                upload_file(
         | 
| 222 | 
            +
                                    path_or_fileobj=str(file_path),
         | 
| 223 | 
            +
                                    path_in_repo=remote_path,
         | 
| 224 | 
            +
                                    repo_id=self.repo_name,
         | 
| 225 | 
            +
                                    token=self.token
         | 
| 226 | 
            +
                                )
         | 
| 227 | 
            +
                        
         | 
| 228 | 
            +
                        logger.info("✅ Model files uploaded successfully")
         | 
| 229 | 
            +
                        return True
         | 
| 230 | 
            +
                        
         | 
| 231 | 
            +
                    except Exception as e:
         | 
| 232 | 
            +
                        logger.error(f"❌ Failed to upload model files: {e}")
         | 
| 233 | 
            +
                        return False
         | 
| 234 | 
            +
                
         | 
| 235 | 
            +
                def upload_training_results(self, results_path: str) -> bool:
         | 
| 236 | 
            +
                    """Upload training results and logs"""
         | 
| 237 | 
            +
                    try:
         | 
| 238 | 
            +
                        logger.info("Uploading training results...")
         | 
| 239 | 
            +
                        
         | 
| 240 | 
            +
                        results_files = [
         | 
| 241 | 
            +
                            "train_results.json",
         | 
| 242 | 
            +
                            "eval_results.json",
         | 
| 243 | 
            +
                            "training_config.json",
         | 
| 244 | 
            +
                            "training.log"
         | 
| 245 | 
            +
                        ]
         | 
| 246 | 
            +
                        
         | 
| 247 | 
            +
                        for file_name in results_files:
         | 
| 248 | 
            +
                            file_path = Path(results_path) / file_name
         | 
| 249 | 
            +
                            if file_path.exists():
         | 
| 250 | 
            +
                                logger.info(f"Uploading {file_name}")
         | 
| 251 | 
            +
                                upload_file(
         | 
| 252 | 
            +
                                    path_or_fileobj=str(file_path),
         | 
| 253 | 
            +
                                    path_in_repo=f"training_results/{file_name}",
         | 
| 254 | 
            +
                                    repo_id=self.repo_name,
         | 
| 255 | 
            +
                                    token=self.token
         | 
| 256 | 
            +
                                )
         | 
| 257 | 
            +
                        
         | 
| 258 | 
            +
                        logger.info("✅ Training results uploaded successfully")
         | 
| 259 | 
            +
                        return True
         | 
| 260 | 
            +
                        
         | 
| 261 | 
            +
                    except Exception as e:
         | 
| 262 | 
            +
                        logger.error(f"❌ Failed to upload training results: {e}")
         | 
| 263 | 
            +
                        return False
         | 
| 264 | 
            +
                
         | 
| 265 | 
            +
                def create_readme(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> bool:
         | 
| 266 | 
            +
                    """Create and upload README.md"""
         | 
| 267 | 
            +
                    try:
         | 
| 268 | 
            +
                        logger.info("Creating README.md...")
         | 
| 269 | 
            +
                        
         | 
| 270 | 
            +
                        readme_content = f"""# {self.repo_name.split('/')[-1]}
         | 
| 271 | 
            +
             | 
| 272 | 
            +
            A fine-tuned SmolLM3 model for text generation tasks.
         | 
| 273 | 
            +
             | 
| 274 | 
            +
            ## Quick Start
         | 
| 275 | 
            +
             | 
| 276 | 
            +
            ```python
         | 
| 277 | 
            +
            from transformers import AutoModelForCausalLM, AutoTokenizer
         | 
| 278 | 
            +
             | 
| 279 | 
            +
            model = AutoModelForCausalLM.from_pretrained("{self.repo_name}")
         | 
| 280 | 
            +
            tokenizer = AutoTokenizer.from_pretrained("{self.repo_name}")
         | 
| 281 | 
            +
             | 
| 282 | 
            +
            # Generate text
         | 
| 283 | 
            +
            text = "Hello, how are you?"
         | 
| 284 | 
            +
            inputs = tokenizer(text, return_tensors="pt")
         | 
| 285 | 
            +
            outputs = model.generate(**inputs, max_new_tokens=100)
         | 
| 286 | 
            +
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))
         | 
| 287 | 
            +
            ```
         | 
| 288 | 
            +
             | 
| 289 | 
            +
            ## Model Information
         | 
| 290 | 
            +
             | 
| 291 | 
            +
            - **Base Model**: HuggingFaceTB/SmolLM3-3B
         | 
| 292 | 
            +
            - **Fine-tuning Date**: {datetime.now().strftime('%Y-%m-%d')}
         | 
| 293 | 
            +
            - **Model Size**: {self._get_model_size():.1f} GB
         | 
| 294 | 
            +
            - **Training Steps**: {results.get('total_steps', 'Unknown')}
         | 
| 295 | 
            +
            - **Final Loss**: {results.get('final_loss', 'Unknown')}
         | 
| 296 | 
            +
             | 
| 297 | 
            +
            ## Training Configuration
         | 
| 298 | 
            +
             | 
| 299 | 
            +
            ```json
         | 
| 300 | 
            +
            {json.dumps(training_config, indent=2)}
         | 
| 301 | 
            +
            ```
         | 
| 302 | 
            +
             | 
| 303 | 
            +
            ## Performance Metrics
         | 
| 304 | 
            +
             | 
| 305 | 
            +
            ```json
         | 
| 306 | 
            +
            {json.dumps(results, indent=2)}
         | 
| 307 | 
            +
            ```
         | 
| 308 | 
            +
             | 
| 309 | 
            +
            ## Files
         | 
| 310 | 
            +
             | 
| 311 | 
            +
            - `pytorch_model.bin`: Model weights
         | 
| 312 | 
            +
            - `config.json`: Model configuration
         | 
| 313 | 
            +
            - `tokenizer.json`: Tokenizer configuration
         | 
| 314 | 
            +
            - `training_results/`: Training logs and results
         | 
| 315 | 
            +
             | 
| 316 | 
            +
            ## License
         | 
| 317 | 
            +
             | 
| 318 | 
            +
            MIT License
         | 
| 319 | 
            +
            """
         | 
| 320 | 
            +
                        
         | 
| 321 | 
            +
                        # Write README to temporary file
         | 
| 322 | 
            +
                        readme_path = Path("temp_readme.md")
         | 
| 323 | 
            +
                        with open(readme_path, "w") as f:
         | 
| 324 | 
            +
                            f.write(readme_content)
         | 
| 325 | 
            +
                        
         | 
| 326 | 
            +
                        # Upload README
         | 
| 327 | 
            +
                        upload_file(
         | 
| 328 | 
            +
                            path_or_fileobj=str(readme_path),
         | 
| 329 | 
            +
                            path_in_repo="README.md",
         | 
| 330 | 
            +
                            repo_id=self.repo_name,
         | 
| 331 | 
            +
                            token=self.token
         | 
| 332 | 
            +
                        )
         | 
| 333 | 
            +
                        
         | 
| 334 | 
            +
                        # Clean up
         | 
| 335 | 
            +
                        readme_path.unlink()
         | 
| 336 | 
            +
                        
         | 
| 337 | 
            +
                        logger.info("✅ README.md uploaded successfully")
         | 
| 338 | 
            +
                        return True
         | 
| 339 | 
            +
                        
         | 
| 340 | 
            +
                    except Exception as e:
         | 
| 341 | 
            +
                        logger.error(f"❌ Failed to create README: {e}")
         | 
| 342 | 
            +
                        return False
         | 
| 343 | 
            +
                
         | 
| 344 | 
            +
                def log_to_trackio(self, action: str, details: Dict[str, Any]):
         | 
| 345 | 
            +
                    """Log push action to Trackio"""
         | 
| 346 | 
            +
                    if self.monitor:
         | 
| 347 | 
            +
                        try:
         | 
| 348 | 
            +
                            self.monitor.log_metrics({
         | 
| 349 | 
            +
                                "push_action": action,
         | 
| 350 | 
            +
                                "repo_name": self.repo_name,
         | 
| 351 | 
            +
                                "model_size_gb": self._get_model_size(),
         | 
| 352 | 
            +
                                **details
         | 
| 353 | 
            +
                            })
         | 
| 354 | 
            +
                            logger.info(f"✅ Logged {action} to Trackio")
         | 
| 355 | 
            +
                        except Exception as e:
         | 
| 356 | 
            +
                            logger.error(f"❌ Failed to log to Trackio: {e}")
         | 
| 357 | 
            +
                
         | 
| 358 | 
            +
                def push_model(self, training_config: Optional[Dict[str, Any]] = None, 
         | 
| 359 | 
            +
                               results: Optional[Dict[str, Any]] = None) -> bool:
         | 
| 360 | 
            +
                    """Complete model push process"""
         | 
| 361 | 
            +
                    logger.info(f"🚀 Starting model push to {self.repo_name}")
         | 
| 362 | 
            +
                    
         | 
| 363 | 
            +
                    # Validate model path
         | 
| 364 | 
            +
                    if not self.validate_model_path():
         | 
| 365 | 
            +
                        return False
         | 
| 366 | 
            +
                    
         | 
| 367 | 
            +
                    # Create repository
         | 
| 368 | 
            +
                    if not self.create_repository():
         | 
| 369 | 
            +
                        return False
         | 
| 370 | 
            +
                    
         | 
| 371 | 
            +
                    # Load training config and results if not provided
         | 
| 372 | 
            +
                    if training_config is None:
         | 
| 373 | 
            +
                        training_config = self._load_training_config()
         | 
| 374 | 
            +
                    
         | 
| 375 | 
            +
                    if results is None:
         | 
| 376 | 
            +
                        results = self._load_training_results()
         | 
| 377 | 
            +
                    
         | 
| 378 | 
            +
                    # Create and upload model card
         | 
| 379 | 
            +
                    model_card = self.create_model_card(training_config, results)
         | 
| 380 | 
            +
                    model_card_path = Path("temp_model_card.md")
         | 
| 381 | 
            +
                    with open(model_card_path, "w") as f:
         | 
| 382 | 
            +
                        f.write(model_card)
         | 
| 383 | 
            +
                    
         | 
| 384 | 
            +
                    try:
         | 
| 385 | 
            +
                        upload_file(
         | 
| 386 | 
            +
                            path_or_fileobj=str(model_card_path),
         | 
| 387 | 
            +
                            path_in_repo="README.md",
         | 
| 388 | 
            +
                            repo_id=self.repo_name,
         | 
| 389 | 
            +
                            token=self.token
         | 
| 390 | 
            +
                        )
         | 
| 391 | 
            +
                    finally:
         | 
| 392 | 
            +
                        model_card_path.unlink()
         | 
| 393 | 
            +
                    
         | 
| 394 | 
            +
                    # Upload model files
         | 
| 395 | 
            +
                    if not self.upload_model_files():
         | 
| 396 | 
            +
                        return False
         | 
| 397 | 
            +
                    
         | 
| 398 | 
            +
                    # Upload training results
         | 
| 399 | 
            +
                    if results:
         | 
| 400 | 
            +
                        self.upload_training_results(str(self.model_path))
         | 
| 401 | 
            +
                    
         | 
| 402 | 
            +
                    # Log to Trackio
         | 
| 403 | 
            +
                    self.log_to_trackio("model_push", {
         | 
| 404 | 
            +
                        "model_path": str(self.model_path),
         | 
| 405 | 
            +
                        "repo_name": self.repo_name,
         | 
| 406 | 
            +
                        "private": self.private,
         | 
| 407 | 
            +
                        "training_config": training_config,
         | 
| 408 | 
            +
                        "results": results
         | 
| 409 | 
            +
                    })
         | 
| 410 | 
            +
                    
         | 
| 411 | 
            +
                    logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_name}")
         | 
| 412 | 
            +
                    return True
         | 
| 413 | 
            +
                
         | 
| 414 | 
            +
                def _load_training_config(self) -> Dict[str, Any]:
         | 
| 415 | 
            +
                    """Load training configuration"""
         | 
| 416 | 
            +
                    config_path = self.model_path / "training_config.json"
         | 
| 417 | 
            +
                    if config_path.exists():
         | 
| 418 | 
            +
                        with open(config_path, "r") as f:
         | 
| 419 | 
            +
                            return json.load(f)
         | 
| 420 | 
            +
                    return {"model_name": "HuggingFaceTB/SmolLM3-3B"}
         | 
| 421 | 
            +
                
         | 
| 422 | 
            +
                def _load_training_results(self) -> Dict[str, Any]:
         | 
| 423 | 
            +
                    """Load training results"""
         | 
| 424 | 
            +
                    results_path = self.model_path / "train_results.json"
         | 
| 425 | 
            +
                    if results_path.exists():
         | 
| 426 | 
            +
                        with open(results_path, "r") as f:
         | 
| 427 | 
            +
                            return json.load(f)
         | 
| 428 | 
            +
                    return {"final_loss": "Unknown", "total_steps": "Unknown"}
         | 
| 429 | 
            +
             | 
| 430 | 
            +
            def parse_args():
         | 
| 431 | 
            +
                """Parse command line arguments"""
         | 
| 432 | 
            +
                parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')
         | 
| 433 | 
            +
                
         | 
| 434 | 
            +
                # Required arguments
         | 
| 435 | 
            +
                parser.add_argument('model_path', type=str, help='Path to trained model directory')
         | 
| 436 | 
            +
                parser.add_argument('repo_name', type=str, help='Hugging Face repository name (username/repo-name)')
         | 
| 437 | 
            +
                
         | 
| 438 | 
            +
                # Optional arguments
         | 
| 439 | 
            +
                parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
         | 
| 440 | 
            +
                parser.add_argument('--private', action='store_true', help='Make repository private')
         | 
| 441 | 
            +
                parser.add_argument('--trackio-url', type=str, default=None, help='Trackio Space URL for logging')
         | 
| 442 | 
            +
                parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for Trackio')
         | 
| 443 | 
            +
                
         | 
| 444 | 
            +
                return parser.parse_args()
         | 
| 445 | 
            +
             | 
| 446 | 
            +
            def main():
         | 
| 447 | 
            +
                """Main function"""
         | 
| 448 | 
            +
                args = parse_args()
         | 
| 449 | 
            +
                
         | 
| 450 | 
            +
                # Setup logging
         | 
| 451 | 
            +
                logging.basicConfig(
         | 
| 452 | 
            +
                    level=logging.INFO,
         | 
| 453 | 
            +
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
         | 
| 454 | 
            +
                )
         | 
| 455 | 
            +
                
         | 
| 456 | 
            +
                logger.info("Starting model push to Hugging Face Hub")
         | 
| 457 | 
            +
                
         | 
| 458 | 
            +
                # Initialize pusher
         | 
| 459 | 
            +
                try:
         | 
| 460 | 
            +
                    pusher = HuggingFacePusher(
         | 
| 461 | 
            +
                        model_path=args.model_path,
         | 
| 462 | 
            +
                        repo_name=args.repo_name,
         | 
| 463 | 
            +
                        token=args.token,
         | 
| 464 | 
            +
                        private=args.private,
         | 
| 465 | 
            +
                        trackio_url=args.trackio_url,
         | 
| 466 | 
            +
                        experiment_name=args.experiment_name
         | 
| 467 | 
            +
                    )
         | 
| 468 | 
            +
                    
         | 
| 469 | 
            +
                    # Push model
         | 
| 470 | 
            +
                    success = pusher.push_model()
         | 
| 471 | 
            +
                    
         | 
| 472 | 
            +
                    if success:
         | 
| 473 | 
            +
                        logger.info("✅ Model push completed successfully!")
         | 
| 474 | 
            +
                        logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
         | 
| 475 | 
            +
                    else:
         | 
| 476 | 
            +
                        logger.error("❌ Model push failed!")
         | 
| 477 | 
            +
                        return 1
         | 
| 478 | 
            +
                        
         | 
| 479 | 
            +
                except Exception as e:
         | 
| 480 | 
            +
                    logger.error(f"❌ Error during model push: {e}")
         | 
| 481 | 
            +
                    return 1
         | 
| 482 | 
            +
                
         | 
| 483 | 
            +
                return 0
         | 
| 484 | 
            +
             | 
| 485 | 
            +
            if __name__ == "__main__":
         | 
| 486 | 
            +
                exit(main()) 
         | 
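Based on the `parse_args()` definition in this script, a typical invocation looks like the sketch below; the model path, repository name, and Trackio URL are placeholders rather than values from this commit, and the token falls back to the `HF_TOKEN` environment variable when `--token` is omitted.

```bash
# Usage sketch for push_to_huggingface.py (path, repo name and URL are placeholders)
python push_to_huggingface.py ./outputs/final_model your-username/smollm3-finetuned \
    --trackio-url "https://your-trackio-space.hf.space" \
    --experiment-name smollm3_push_demo
```

Add `--private` to create a private repository; `--token` overrides the `HF_TOKEN` environment variable.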
    	
        requirements.txt
    CHANGED
    
    | @@ -32,4 +32,11 @@ sentencepiece>=0.1.99 | |
| 32 | 
             
            # Development
         | 
| 33 | 
             
            pytest>=7.0.0
         | 
| 34 | 
             
            black>=23.0.0
         | 
| 35 | 
            -
            isort>=5.12.0 | 
| 32 | 
             
            # Development
         | 
| 33 | 
             
            pytest>=7.0.0
         | 
| 34 | 
             
            black>=23.0.0
         | 
| 35 | 
            +
            isort>=5.12.0
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            # Experiment tracking and monitoring
         | 
| 38 | 
            +
            trackio>=0.1.0
         | 
| 39 | 
            +
            psutil>=5.9.0
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            # Hugging Face Hub integration
         | 
| 42 | 
            +
            huggingface_hub>=0.16.0 
         | 
    	
        requirements_space.txt
    ADDED
    
    | @@ -0,0 +1,18 @@ | |
| 1 | 
            +
            # Gradio and web interface
         | 
| 2 | 
            +
            gradio>=4.0.0
         | 
| 3 | 
            +
            gradio-client>=0.10.0
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # Core dependencies for Trackio Space
         | 
| 6 | 
            +
            requests>=2.31.0
         | 
| 7 | 
            +
            numpy>=1.24.0
         | 
| 8 | 
            +
            pandas>=2.0.0
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # JSON and data handling
         | 
| 11 | 
            +
            jsonschema>=4.17.0
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            # Optional: for better UI
         | 
| 14 | 
            +
            plotly>=5.15.0
         | 
| 15 | 
            +
            matplotlib>=3.7.0
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            # Development and debugging
         | 
| 18 | 
            +
            python-dotenv>=1.0.0 
         | 
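The Space dependencies listed above can be installed in one step; a minimal sketch:

```bash
pip install -r requirements_space.txt
```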
    	
        run_a100_large_experiment.py
    ADDED
    
    | @@ -0,0 +1,134 @@ | |
| 1 | 
            +
            #!/usr/bin/env python3
         | 
| 2 | 
            +
            """
         | 
| 3 | 
            +
            Script to run A100 large-scale experiments on OpenHermes-FR dataset
         | 
| 4 | 
            +
            Supports multiple configurations for different training scenarios
         | 
| 5 | 
            +
            """
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            import argparse
         | 
| 8 | 
            +
            import os
         | 
| 9 | 
            +
            import sys
         | 
| 10 | 
            +
            from pathlib import Path
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            def main():
         | 
| 13 | 
            +
                parser = argparse.ArgumentParser(description="Run A100 large-scale experiments")
         | 
| 14 | 
            +
                parser.add_argument(
         | 
| 15 | 
            +
                    "--config", 
         | 
| 16 | 
            +
                    type=str, 
         | 
| 17 | 
            +
                    default="config/train_smollm3_openhermes_fr_a100_large.py",
         | 
| 18 | 
            +
                    help="Configuration file to use"
         | 
| 19 | 
            +
                )
         | 
| 20 | 
            +
                parser.add_argument(
         | 
| 21 | 
            +
                    "--experiment-name",
         | 
| 22 | 
            +
                    type=str,
         | 
| 23 | 
            +
                    help="Custom experiment name for tracking"
         | 
| 24 | 
            +
                )
         | 
| 25 | 
            +
                parser.add_argument(
         | 
| 26 | 
            +
                    "--output-dir",
         | 
| 27 | 
            +
                    type=str,
         | 
| 28 | 
            +
                    default="./outputs",
         | 
| 29 | 
            +
                    help="Output directory for checkpoints and logs"
         | 
| 30 | 
            +
                )
         | 
| 31 | 
            +
                parser.add_argument(
         | 
| 32 | 
            +
                    "--resume",
         | 
| 33 | 
            +
                    type=str,
         | 
| 34 | 
            +
                    help="Resume training from checkpoint"
         | 
| 35 | 
            +
                )
         | 
| 36 | 
            +
                parser.add_argument(
         | 
| 37 | 
            +
                    "--dry-run",
         | 
| 38 | 
            +
                    action="store_true",
         | 
| 39 | 
            +
                    help="Print configuration without starting training"
         | 
| 40 | 
            +
                )
         | 
| 41 | 
            +
                
         | 
| 42 | 
            +
                args = parser.parse_args()
         | 
| 43 | 
            +
                
         | 
| 44 | 
            +
                # Add the current directory to Python path
         | 
| 45 | 
            +
                sys.path.insert(0, str(Path(__file__).parent))
         | 
| 46 | 
            +
                
         | 
| 47 | 
            +
                # Import the configuration
         | 
| 48 | 
            +
                try:
         | 
| 49 | 
            +
                    from config.train_smollm3_openhermes_fr_a100_large import get_config as get_large_config
         | 
| 50 | 
            +
                    from config.train_smollm3_openhermes_fr_a100_multiple_passes import get_config as get_multiple_passes_config
         | 
| 51 | 
            +
                    
         | 
| 52 | 
            +
                    # Map config files to their respective functions
         | 
| 53 | 
            +
                    config_map = {
         | 
| 54 | 
            +
                        "config/train_smollm3_openhermes_fr_a100_large.py": get_large_config,
         | 
| 55 | 
            +
                        "config/train_smollm3_openhermes_fr_a100_multiple_passes.py": get_multiple_passes_config,
         | 
| 56 | 
            +
                    }
         | 
| 57 | 
            +
                    
         | 
| 58 | 
            +
                    if args.config in config_map:
         | 
| 59 | 
            +
                        config = config_map[args.config](args.config)
         | 
| 60 | 
            +
                    else:
         | 
| 61 | 
            +
                        # Try to load from the specified config file
         | 
| 62 | 
            +
                        config = get_large_config(args.config)
         | 
| 63 | 
            +
                        
         | 
| 64 | 
            +
                except ImportError as e:
         | 
| 65 | 
            +
                    print(f"Error importing configuration: {e}")
         | 
| 66 | 
            +
                    print("Available configurations:")
         | 
| 67 | 
            +
                    print("  - config/train_smollm3_openhermes_fr_a100_large.py (Large batch, 1.3 passes)")
         | 
| 68 | 
            +
                    print("  - config/train_smollm3_openhermes_fr_a100_multiple_passes.py (Multiple passes, 4 epochs)")
         | 
| 69 | 
            +
                    return 1
         | 
| 70 | 
            +
                
         | 
| 71 | 
            +
                # Override experiment name if provided
         | 
| 72 | 
            +
                if args.experiment_name:
         | 
| 73 | 
            +
                    config.experiment_name = args.experiment_name
         | 
| 74 | 
            +
                
         | 
| 75 | 
            +
                # Create output directory
         | 
| 76 | 
            +
                os.makedirs(args.output_dir, exist_ok=True)
         | 
| 77 | 
            +
                
         | 
| 78 | 
            +
                # Print configuration summary
         | 
| 79 | 
            +
                print(f"\n{'='*60}")
         | 
| 80 | 
            +
                print(f"EXPERIMENT CONFIGURATION")
         | 
| 81 | 
            +
                print(f"{'='*60}")
         | 
| 82 | 
            +
                print(f"Config file: {args.config}")
         | 
| 83 | 
            +
                print(f"Experiment name: {config.experiment_name}")
         | 
| 84 | 
            +
                print(f"Output directory: {args.output_dir}")
         | 
| 85 | 
            +
                print(f"Model: {config.model_name}")
         | 
| 86 | 
            +
                print(f"Batch size: {config.batch_size}")
         | 
| 87 | 
            +
                print(f"Gradient accumulation: {config.gradient_accumulation_steps}")
         | 
| 88 | 
            +
                print(f"Effective batch size: {config.batch_size * config.gradient_accumulation_steps}")
         | 
| 89 | 
            +
                print(f"Learning rate: {config.learning_rate}")
         | 
| 90 | 
            +
                print(f"Max iterations: {config.max_iters}")
         | 
| 91 | 
            +
                print(f"Max sequence length: {config.max_seq_length}")
         | 
| 92 | 
            +
                print(f"Mixed precision: {'bf16' if config.bf16 else 'fp16'}")
         | 
| 93 | 
            +
                print(f"Dataset: {config.dataset_name}")
         | 
| 94 | 
            +
                print(f"{'='*60}\n")
         | 
| 95 | 
            +
                
         | 
| 96 | 
            +
                if args.dry_run:
         | 
| 97 | 
            +
                    print("DRY RUN - Configuration printed above. Use without --dry-run to start training.")
         | 
| 98 | 
            +
                    return 0
         | 
| 99 | 
            +
                
         | 
| 100 | 
            +
                # Import and run training
         | 
| 101 | 
            +
                try:
         | 
| 102 | 
            +
                    from train import main as train_main
         | 
| 103 | 
            +
                    
         | 
| 104 | 
            +
                    # Set up training arguments
         | 
| 105 | 
            +
                    train_args = [
         | 
| 106 | 
            +
                        "--config", args.config,
         | 
| 107 | 
            +
                        "--output-dir", args.output_dir,
         | 
| 108 | 
            +
                    ]
         | 
| 109 | 
            +
                    
         | 
| 110 | 
            +
                    if args.resume:
         | 
| 111 | 
            +
                        train_args.extend(["--resume", args.resume])
         | 
| 112 | 
            +
                    
         | 
| 113 | 
            +
                    # Override sys.argv for the training script
         | 
| 114 | 
            +
                    original_argv = sys.argv
         | 
| 115 | 
            +
                    sys.argv = ["train.py"] + train_args
         | 
| 116 | 
            +
                    
         | 
| 117 | 
            +
                    # Run training
         | 
| 118 | 
            +
                    train_main()
         | 
| 119 | 
            +
                    
         | 
| 120 | 
            +
                    # Restore original argv
         | 
| 121 | 
            +
                    sys.argv = original_argv
         | 
| 122 | 
            +
                    
         | 
| 123 | 
            +
                except ImportError as e:
         | 
| 124 | 
            +
                    print(f"Error importing training module: {e}")
         | 
| 125 | 
            +
                    print("Make sure train.py is available in the current directory.")
         | 
| 126 | 
            +
                    return 1
         | 
| 127 | 
            +
                except Exception as e:
         | 
| 128 | 
            +
                    print(f"Error during training: {e}")
         | 
| 129 | 
            +
                    return 1
         | 
| 130 | 
            +
                
         | 
| 131 | 
            +
                return 0
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            if __name__ == "__main__":
         | 
| 134 | 
            +
                exit(main()) 
         | 
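The runner exposes a small CLI around the two A100 configurations; the sketch below shows a dry run that only prints the resolved configuration, followed by an actual launch (the experiment name and output directory are placeholders):

```bash
# Print the resolved configuration without starting training
python run_a100_large_experiment.py \
    --config config/train_smollm3_openhermes_fr_a100_multiple_passes.py \
    --dry-run

# Launch training with a custom experiment name and output directory (placeholders)
python run_a100_large_experiment.py \
    --config config/train_smollm3_openhermes_fr_a100_large.py \
    --experiment-name my_a100_large_run \
    --output-dir ./outputs/a100-large
```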
    	
        test_monitoring.py
    ADDED
    
    | @@ -0,0 +1,181 @@ | |
| 1 | 
            +
            #!/usr/bin/env python3
         | 
| 2 | 
            +
            """
         | 
| 3 | 
            +
            Quick Start Script for Trackio Integration
         | 
| 4 | 
            +
            Tests the monitoring functionality without full training
         | 
| 5 | 
            +
            """
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            import os
         | 
| 8 | 
            +
            import json
         | 
| 9 | 
            +
            import logging
         | 
| 10 | 
            +
            from datetime import datetime
         | 
| 11 | 
            +
            from monitoring import SmolLM3Monitor
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            def setup_logging():
         | 
| 14 | 
            +
                """Setup logging configuration"""
         | 
| 15 | 
            +
                logging.basicConfig(
         | 
| 16 | 
            +
                    level=logging.INFO,
         | 
| 17 | 
            +
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
         | 
| 18 | 
            +
                )
         | 
| 19 | 
            +
                return logging.getLogger(__name__)
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            def test_trackio_integration():
         | 
| 22 | 
            +
                """Test Trackio integration with sample data"""
         | 
| 23 | 
            +
                logger = setup_logging()
         | 
| 24 | 
            +
                
         | 
| 25 | 
            +
                print("🚀 Testing Trackio Integration")
         | 
| 26 | 
            +
                print("=" * 40)
         | 
| 27 | 
            +
                
         | 
| 28 | 
            +
                # Get Trackio URL from user or environment
         | 
| 29 | 
            +
                trackio_url = os.getenv('TRACKIO_URL')
         | 
| 30 | 
            +
                if not trackio_url:
         | 
| 31 | 
            +
                    trackio_url = input("Enter your Trackio Space URL (or press Enter to skip): ").strip()
         | 
| 32 | 
            +
                    if not trackio_url:
         | 
| 33 | 
            +
                        print("⚠️  No Trackio URL provided. Running in local mode only.")
         | 
| 34 | 
            +
                        trackio_url = None
         | 
| 35 | 
            +
                
         | 
| 36 | 
            +
                # Initialize monitor
         | 
| 37 | 
            +
                experiment_name = f"test_experiment_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
         | 
| 38 | 
            +
                
         | 
| 39 | 
            +
                monitor = SmolLM3Monitor(
         | 
| 40 | 
            +
                    experiment_name=experiment_name,
         | 
| 41 | 
            +
                    trackio_url=trackio_url,
         | 
| 42 | 
            +
                    enable_tracking=trackio_url is not None,
         | 
| 43 | 
            +
                    log_artifacts=True,
         | 
| 44 | 
            +
                    log_metrics=True,
         | 
| 45 | 
            +
                    log_config=True
         | 
| 46 | 
            +
                )
         | 
| 47 | 
            +
                
         | 
| 48 | 
            +
                print(f"✅ Monitor initialized for experiment: {experiment_name}")
         | 
| 49 | 
            +
                
         | 
| 50 | 
            +
                # Test configuration logging
         | 
| 51 | 
            +
                sample_config = {
         | 
| 52 | 
            +
                    "model_name": "HuggingFaceTB/SmolLM3-3B",
         | 
| 53 | 
            +
                    "batch_size": 4,
        "learning_rate": 2e-5,
        "max_iters": 1000,
        "max_seq_length": 4096,
        "test_mode": True
    }

    print("📝 Logging configuration...")
    monitor.log_config(sample_config)

    # Test metrics logging
    print("📊 Logging sample metrics...")
    for step in range(0, 100, 10):
        metrics = {
            "loss": 2.0 - (step * 0.015),  # Simulate decreasing loss
            "accuracy": 0.5 + (step * 0.004),  # Simulate increasing accuracy
            "learning_rate": 2e-5,
            "step": step
        }
        monitor.log_metrics(metrics, step=step)
        print(f"   Step {step}: loss={metrics['loss']:.3f}, accuracy={metrics['accuracy']:.3f}")

    # Test system metrics
    print("💻 Logging system metrics...")
    monitor.log_system_metrics(step=50)

    # Test evaluation results
    print("📈 Logging evaluation results...")
    eval_results = {
        "eval_loss": 1.2,
        "eval_accuracy": 0.85,
        "perplexity": 3.3,
        "bleu_score": 0.72
    }
    monitor.log_evaluation_results(eval_results, step=100)

    # Test training summary
    print("📋 Logging training summary...")
    summary = {
        "final_loss": 0.5,
        "final_accuracy": 0.89,
        "total_steps": 100,
        "training_time_hours": 2.5,
        "model_size_gb": 6.2,
        "test_mode": True
    }
    monitor.log_training_summary(summary)

    # Close monitoring
    monitor.close()

    print("✅ Trackio integration test completed!")

    if trackio_url:
        experiment_url = monitor.get_experiment_url()
        if experiment_url:
            print(f"🌐 View your experiment at: {experiment_url}")

    return True

def test_local_monitoring():
    """Test local monitoring without Trackio"""
    logger = setup_logging()

    print("🔧 Testing Local Monitoring")
    print("=" * 30)

    # Initialize monitor without Trackio
    experiment_name = f"local_test_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    monitor = SmolLM3Monitor(
        experiment_name=experiment_name,
        enable_tracking=False,  # Disable Trackio
        log_artifacts=True,
        log_metrics=True,
        log_config=True
    )

    print(f"✅ Local monitor initialized for experiment: {experiment_name}")

    # Test local logging
    sample_config = {
        "model_name": "HuggingFaceTB/SmolLM3-3B",
        "batch_size": 4,
        "learning_rate": 2e-5,
        "local_test": True
    }

    print("📝 Logging configuration locally...")
    monitor.log_config(sample_config)

    # Test local metrics
    print("📊 Logging sample metrics locally...")
    for step in range(0, 50, 10):
        metrics = {
            "loss": 1.8 - (step * 0.02),
            "accuracy": 0.6 + (step * 0.005),
            "step": step
        }
        monitor.log_metrics(metrics, step=step)
        print(f"   Step {step}: loss={metrics['loss']:.3f}, accuracy={metrics['accuracy']:.3f}")

    print("✅ Local monitoring test completed!")
    return True

def main():
    """Main function"""
    print("Trackio Integration Quick Start")
    print("=" * 40)

    # Test local monitoring first
    test_local_monitoring()
    print()

    # Test Trackio integration if available
    try:
        test_trackio_integration()
    except Exception as e:
        print(f"❌ Trackio integration test failed: {e}")
        print("💡 Make sure you have a valid Trackio Space URL")

    print("\n🎉 Quick start completed!")
    print("\nNext steps:")
    print("1. Deploy Trackio to Hugging Face Spaces (see DEPLOYMENT_GUIDE.md)")
    print("2. Update your training script with Trackio integration")
    print("3. Run your first monitored training session")

if __name__ == "__main__":
    main()
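The test above exercises the full monitor surface (log_config, log_metrics, log_system_metrics, log_evaluation_results, log_training_summary, close). As a rough guide to reusing it outside the test script, here is a minimal sketch; it assumes SmolLM3Monitor is importable from monitoring.py, as this script's usage suggests, and every value in it is illustrative only.

```python
# Minimal sketch: driving the monitor directly from your own script.
# Assumes SmolLM3Monitor lives in monitoring.py; metric values are illustrative only.
from datetime import datetime
from monitoring import SmolLM3Monitor

monitor = SmolLM3Monitor(
    experiment_name=f"demo_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    enable_tracking=False,  # local-only logging, as in test_local_monitoring()
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)
monitor.log_config({"model_name": "HuggingFaceTB/SmolLM3-3B", "batch_size": 4})
for step in range(0, 30, 10):
    monitor.log_metrics({"loss": 1.5 - step * 0.01, "step": step}, step=step)
monitor.log_training_summary({"final_loss": 1.2, "total_steps": 30})
monitor.close()
```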
    	
train.py CHANGED

@@ -76,6 +76,16 @@ def parse_args():
     parser.add_argument('--logging_steps', type=int, default=10,
                        help='Log every N steps')

+    # Trackio monitoring arguments
+    parser.add_argument('--enable_tracking', action='store_true', default=True,
+                       help='Enable Trackio experiment tracking')
+    parser.add_argument('--trackio_url', type=str, default=None,
+                       help='Trackio server URL')
+    parser.add_argument('--trackio_token', type=str, default=None,
+                       help='Trackio authentication token')
+    parser.add_argument('--experiment_name', type=str, default=None,
+                       help='Custom experiment name for tracking')
+
     return parser.parse_args()

 def main():
@@ -99,14 +109,22 @@ def main():
     if args.gradient_accumulation_steps is not None:
         config.gradient_accumulation_steps = args.gradient_accumulation_steps

+    # Override Trackio configuration
+    if args.enable_tracking is not None:
+        config.enable_tracking = args.enable_tracking
+    if args.trackio_url is not None:
+        config.trackio_url = args.trackio_url
+    if args.trackio_token is not None:
+        config.trackio_token = args.trackio_token
+    if args.experiment_name is not None:
+        config.experiment_name = args.experiment_name
+
     # Setup paths
-    dataset_path = os.path.join('/input', args.dataset_dir)
     output_path = args.out_dir

     # Ensure output directory exists
     os.makedirs(output_path, exist_ok=True)

-    logger.info(f"Dataset path: {dataset_path}")
     logger.info(f"Output path: {output_path}")

     # Initialize model
@@ -116,11 +134,23 @@ def main():
         config=config
     )

-    # ...
+    # Determine dataset path
+    if hasattr(config, 'dataset_name') and config.dataset_name:
+        # Use Hugging Face dataset
+        dataset_path = config.dataset_name
+        logger.info(f"Using Hugging Face dataset: {dataset_path}")
+    else:
+        # Use local dataset
+        dataset_path = os.path.join('/input', args.dataset_dir)
+        logger.info(f"Using local dataset: {dataset_path}")
+
+    # Load dataset with filtering options
     dataset = SmolLM3Dataset(
         data_path=dataset_path,
         tokenizer=model.tokenizer,
-        max_seq_length=args.max_seq_length
+        max_seq_length=args.max_seq_length,
+        filter_bad_entries=getattr(config, 'filter_bad_entries', False),
+        bad_entry_field=getattr(config, 'bad_entry_field', 'bad_entry')
     )

     # Initialize trainer
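With this change, main() consults several optional attributes on the config object: dataset_name selects a Hugging Face dataset over a local /input path, filter_bad_entries and bad_entry_field control dataset filtering, and the Trackio fields can be overridden by the new CLI flags. The dataclass below is only a hypothetical illustration of that attribute surface; the real definitions live in the config/ modules added in this commit (e.g. config/train_smollm3_openhermes_fr_a100_large.py).

```python
# Hypothetical sketch of the config attributes train.py now reads.
# The actual configs ship in the config/ package; the field names mirror the diff only.
from dataclasses import dataclass
from typing import Optional

@dataclass
class MonitoredTrainingConfig:
    model_name: str = "HuggingFaceTB/SmolLM3-3B"
    # Dataset selection: a Hugging Face dataset id takes precedence over /input/<dataset_dir>
    dataset_name: Optional[str] = None
    filter_bad_entries: bool = False
    bad_entry_field: str = "bad_entry"
    # Trackio settings, overridable via --enable_tracking / --trackio_url / --trackio_token / --experiment_name
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    experiment_name: Optional[str] = None
```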
    	
trainer.py CHANGED

@@ -11,6 +11,9 @@ from transformers import Trainer, TrainingArguments
 from trl import SFTTrainer
 import json

+# Import monitoring
+from monitoring import create_monitor_from_config
+
 logger = logging.getLogger(__name__)

 class SmolLM3Trainer:
@@ -32,6 +35,9 @@ class SmolLM3Trainer:
         self.init_from = init_from
         self.use_sft_trainer = use_sft_trainer

+        # Initialize monitoring
+        self.monitor = create_monitor_from_config(config)
+
         # Setup trainer
         self.trainer = self._setup_trainer()

@@ -55,6 +61,13 @@ class SmolLM3Trainer:
         # Get data collator
         data_collator = self.dataset.get_data_collator()

+        # Add monitoring callback
+        callbacks = []
+        if self.monitor and self.monitor.enable_tracking:
+            trackio_callback = self.monitor.create_monitoring_callback()
+            if trackio_callback:
+                callbacks.append(trackio_callback)
+
         if self.use_sft_trainer:
             # Use SFTTrainer for supervised fine-tuning
             trainer = SFTTrainer(
@@ -67,6 +80,7 @@ class SmolLM3Trainer:
                 dataset_text_field="text",
                 max_seq_length=self.config.max_seq_length,
                 packing=False,  # Disable packing for better control
+                callbacks=callbacks,
             )
         else:
             # Use standard Trainer
@@ -77,6 +91,7 @@ class SmolLM3Trainer:
                 train_dataset=train_dataset,
                 eval_dataset=eval_dataset,
                 data_collator=data_collator,
+                callbacks=callbacks,
             )

         return trainer
@@ -103,6 +118,17 @@ class SmolLM3Trainer:
         """Start training"""
         logger.info("Starting training")

+        # Log configuration to Trackio
+        if self.monitor and self.monitor.enable_tracking:
+            config_dict = {k: v for k, v in self.config.__dict__.items()
+                           if not k.startswith('_')}
+            self.monitor.log_config(config_dict)
+
+            # Log experiment URL
+            experiment_url = self.monitor.get_experiment_url()
+            if experiment_url:
+                logger.info(f"Trackio experiment URL: {experiment_url}")
+
         # Load checkpoint if resuming
         if self.init_from == "resume":
             checkpoint_path = "/input-checkpoint"
@@ -122,11 +148,26 @@ class SmolLM3Trainer:
             with open(os.path.join(self.output_dir, "train_results.json"), "w") as f:
                 json.dump(train_result.metrics, f, indent=2)

+            # Log training summary to Trackio
+            if self.monitor and self.monitor.enable_tracking:
+                summary = {
+                    'final_loss': train_result.metrics.get('train_loss', 0),
+                    'total_steps': train_result.metrics.get('train_runtime', 0),
+                    'training_time': train_result.metrics.get('train_runtime', 0),
+                    'output_dir': self.output_dir,
+                    'model_name': getattr(self.config, 'model_name', 'unknown'),
+                }
+                self.monitor.log_training_summary(summary)
+                self.monitor.close()
+
             logger.info("Training completed successfully!")
             logger.info(f"Training metrics: {train_result.metrics}")

         except Exception as e:
             logger.error(f"Training failed: {e}")
+            # Close monitoring on error
+            if self.monitor and self.monitor.enable_tracking:
+                self.monitor.close()
             raise

     def evaluate(self):
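trainer.py depends on only a small contract from the monitoring module: create_monitor_from_config(config) must return an object exposing enable_tracking, create_monitoring_callback(), log_config(), get_experiment_url(), log_training_summary() and close(). The stub below restates that contract for reference; it is not the SmolLM3Monitor implementation shipped in monitoring.py, just a no-op stand-in with the same shape.

```python
# No-op stand-in for the interface trainer.py expects from monitoring.py.
# The real create_monitor_from_config/SmolLM3Monitor forward these calls to Trackio.
import logging

logger = logging.getLogger(__name__)

class _StubMonitor:
    def __init__(self, config):
        self.enable_tracking = getattr(config, "enable_tracking", False)

    def create_monitoring_callback(self):
        # The real implementation presumably returns a transformers TrainerCallback (or None).
        return None

    def log_config(self, config_dict):
        logger.info("config: %s", config_dict)

    def get_experiment_url(self):
        return None

    def log_training_summary(self, summary):
        logger.info("summary: %s", summary)

    def close(self):
        pass

def create_monitor_from_config(config):
    return _StubMonitor(config)
```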
