#!/usr/bin/env python3
"""
Setup script for a Hugging Face Dataset repository for Trackio experiments.
"""

import json
import os
import sys
from datetime import datetime
from typing import Optional

from huggingface_hub import HfApi, create_repo


def get_username_from_token(token: str) -> Optional[str]:
    """
    Get the username associated with an HF token using the API.

    Args:
        token (str): Hugging Face token

    Returns:
        Optional[str]: Username if successful, None otherwise
    """
    try:
        # Create an API client authenticated with the token
        api = HfApi(token=token)
        # Fetch the authenticated user's info
        user_info = api.whoami()
        return user_info.get("name", user_info.get("username"))
    except Exception as e:
        print(f"❌ Error getting username from token: {e}")
        return None
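
# Example (sketch): resolving the username for a token pulled from the
# environment. Assumes HF_TOKEN is already set; whoami() performs a network
# call against the Hub.
#
#   username = get_username_from_token(os.environ["HF_TOKEN"])
#   if username:
#       print(f"Token belongs to: {username}")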


def create_dataset_repository(username: str, dataset_name: str = "trackio-experiments",
                              token: Optional[str] = None) -> Optional[str]:
    """
    Create a dataset repository on Hugging Face.

    Args:
        username (str): HF username
        dataset_name (str): Name for the dataset repository
        token (Optional[str]): HF token for authentication

    Returns:
        Optional[str]: Full repository ID (username/dataset_name), or None on failure
    """
    repo_id = f"{username}/{dataset_name}"
    try:
        # Create the dataset repository (exist_ok makes this idempotent)
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False  # Public dataset for easier sharing
        )
        print(f"✅ Successfully created dataset repository: {repo_id}")
        return repo_id
    except Exception as e:
        if "already exists" in str(e).lower():
            print(f"ℹ️ Dataset repository already exists: {repo_id}")
            return repo_id
        print(f"❌ Error creating dataset repository: {e}")
        return None
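
# Example (sketch): creating a repository directly. The username and dataset
# name below are illustrative; exist_ok=True makes the call safe to repeat.
#
#   repo_id = create_dataset_repository("my-username", "trackio-experiments",
#                                       token=os.environ["HF_TOKEN"])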


def setup_trackio_dataset(dataset_name: Optional[str] = None, token: Optional[str] = None) -> bool:
    """
    Set up the Trackio dataset repository automatically.

    Args:
        dataset_name (Optional[str]): Custom dataset name (default: trackio-experiments)
        token (Optional[str]): HF token for authentication

    Returns:
        bool: True if successful, False otherwise
    """
    print("🚀 Setting up Trackio Dataset Repository")
    print("=" * 50)

    # Get the token from the parameter, the environment, or the command line
    if not token:
        token = os.environ.get('HUGGING_FACE_HUB_TOKEN') or os.environ.get('HF_TOKEN')
    if not token and len(sys.argv) > 1:
        token = sys.argv[1]
    if not token:
        print("❌ No HF token found. Set the HUGGING_FACE_HUB_TOKEN environment variable or pass the token as an argument.")
        return False

    # Resolve the username from the token
    print("🔍 Getting username from token...")
    username = get_username_from_token(token)
    if not username:
        print("❌ Could not determine username from token. Please check your token.")
        return False
    print(f"✅ Authenticated as: {username}")

    # Use the provided dataset name or fall back to the default
    if not dataset_name:
        dataset_name = "trackio-experiments"

    # Create the dataset repository
    print(f"🔧 Creating dataset repository: {username}/{dataset_name}")
    repo_id = create_dataset_repository(username, dataset_name, token)
    if not repo_id:
        print("❌ Failed to create dataset repository")
        return False

    # Expose the repository ID to other scripts
    os.environ['TRACKIO_DATASET_REPO'] = repo_id
    print(f"✅ Set TRACKIO_DATASET_REPO={repo_id}")

    # Add initial experiment data
    print("📊 Adding initial experiment data...")
    if add_initial_experiment_data(repo_id, token):
        print("✅ Successfully added initial experiment data")
    else:
        print("⚠️ Could not add initial experiment data (this is optional)")

    # Add the dataset README
    print("📝 Adding dataset README...")
    if add_dataset_readme(repo_id, token):
        print("✅ Successfully added dataset README")
    else:
        print("⚠️ Could not add dataset README (this is optional)")

    print("\n🎉 Dataset setup complete!")
    print(f"📊 Dataset URL: https://huggingface.co/datasets/{repo_id}")
    print(f"🔧 Repository ID: {repo_id}")
    return True
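
# Example (sketch): running the full setup programmatically instead of via the
# CLI. The dataset name is illustrative; passing None falls back to
# "trackio-experiments".
#
#   ok = setup_trackio_dataset("my-trackio-experiments", os.environ["HF_TOKEN"])
#   if ok:
#       print(os.environ["TRACKIO_DATASET_REPO"])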


def add_initial_experiment_data(repo_id: str, token: Optional[str] = None) -> bool:
    """
    Add initial experiment data to the dataset, preserving any existing data.

    Args:
        repo_id (str): Dataset repository ID
        token (Optional[str]): HF token for authentication

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Get the token from the parameter or the environment
        if not token:
            token = os.environ.get('HUGGING_FACE_HUB_TOKEN') or os.environ.get('HF_TOKEN')
        if not token:
            print("⚠️ No token available for uploading data")
            return False

        # Import the dataset manager from the project's src directory
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
        from dataset_utils import TrackioDatasetManager

        # Initialize the dataset manager
        dataset_manager = TrackioDatasetManager(repo_id, token)

        # Check whether the dataset already has data
        existing_experiments = dataset_manager.load_existing_experiments()
        if existing_experiments:
            print(f"ℹ️ Dataset already contains {len(existing_experiments)} experiments, preserving existing data")

        # Initial demo experiment entry
        initial_experiment = {
            'experiment_id': f'exp_demo_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
            'name': 'smollm3-finetune-demo',
            'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
            'created_at': datetime.now().isoformat(),
            'status': 'completed',
            'metrics': json.dumps([
                {
                    'timestamp': datetime.now().isoformat(),
                    'step': 100,
                    'metrics': {
                        'loss': 1.15,
                        'grad_norm': 10.5,
                        'learning_rate': 5e-6,
                        'num_tokens': 1000000.0,
                        'mean_token_accuracy': 0.76,
                        'epoch': 0.1,
                        'total_tokens': 1000000.0,
                        'throughput': 2000000.0,
                        'step_time': 0.5,
                        'batch_size': 2,
                        'seq_len': 4096,
                        'token_acc': 0.76,
                        'gpu_memory_allocated': 15.2,
                        'gpu_memory_reserved': 70.1,
                        'gpu_utilization': 85.2,
                        'cpu_percent': 2.7,
                        'memory_percent': 10.1
                    }
                }
            ]),
            'parameters': json.dumps({
                'model_name': 'HuggingFaceTB/SmolLM3-3B',
                'max_seq_length': 4096,
                'batch_size': 2,
                'learning_rate': 5e-6,
                'epochs': 3,
                'dataset': 'OpenHermes-FR',
                'trainer_type': 'SFTTrainer',
                'hardware': 'GPU (H100/A100)',
                'mixed_precision': True,
                'gradient_checkpointing': True,
                'flash_attention': True
            }),
            'artifacts': json.dumps([]),
            'logs': json.dumps([
                {
                    'timestamp': datetime.now().isoformat(),
                    'level': 'INFO',
                    'message': 'Training started successfully'
                },
                {
                    'timestamp': datetime.now().isoformat(),
                    'level': 'INFO',
                    'message': 'Model loaded and configured'
                },
                {
                    'timestamp': datetime.now().isoformat(),
                    'level': 'INFO',
                    'message': 'Dataset loaded and preprocessed'
                }
            ]),
            'last_updated': datetime.now().isoformat()
        }

        # Use the dataset manager to safely upsert the experiment; the README
        # is uploaded separately by setup_trackio_dataset()
        if dataset_manager.upsert_experiment(initial_experiment):
            print(f"✅ Successfully added initial experiment data to {repo_id}")
            final_count = len(dataset_manager.load_existing_experiments())
            print(f"📊 Dataset now contains {final_count} total experiments")
            return True
        print(f"❌ Failed to add initial experiment data to {repo_id}")
        return False
    except Exception as e:
        print(f"⚠️ Could not add initial experiment data: {e}")
        return False
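
# Example (sketch): reading the stored experiments back with the `datasets`
# library. The repo ID is illustrative, and the "train" split name assumes the
# manager pushes the default split. The metrics/parameters/logs columns are
# JSON strings, so they need json.loads() after loading.
#
#   from datasets import load_dataset
#   ds = load_dataset("username/trackio-experiments", split="train")
#   for row in ds:
#       metrics = json.loads(row["metrics"])
#       print(row["experiment_id"], metrics[0]["metrics"]["loss"])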


def add_dataset_readme(repo_id: str, token: str) -> bool:
    """
    Add a README to the dataset repository.

    Args:
        repo_id (str): Dataset repository ID
        token (str): HF token

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Read the README template if the project ships one
        template_path = os.path.join(os.path.dirname(__file__), '..', '..', 'templates', 'datasets', 'readme.md')
        if os.path.exists(template_path):
            with open(template_path, 'r', encoding='utf-8') as f:
                readme_content = f.read()
        else:
            # Fall back to a basic README if the template doesn't exist
            readme_content = """---
dataset_info:
  features:
  - name: experiment_id
    dtype: string
  - name: name
    dtype: string
  - name: description
    dtype: string
  - name: created_at
    dtype: string
  - name: status
    dtype: string
  - name: metrics
    dtype: string
  - name: parameters
    dtype: string
  - name: artifacts
    dtype: string
  - name: logs
    dtype: string
  - name: last_updated
    dtype: string
tags:
- trackio
- experiment tracking
- smollm3
- fine-tuning
---

# Trackio Experiments Dataset

This dataset stores experiment tracking data for ML training runs, particularly SmolLM3 fine-tuning experiments with comprehensive metrics tracking.

## Dataset Structure

The dataset contains the following columns:

- **experiment_id**: Unique identifier for each experiment
- **name**: Human-readable name for the experiment
- **description**: Detailed description of the experiment
- **created_at**: Timestamp when the experiment was created
- **status**: Current status (running, completed, failed, paused)
- **metrics**: JSON string containing training metrics over time
- **parameters**: JSON string containing experiment configuration
- **artifacts**: JSON string containing experiment artifacts
- **logs**: JSON string containing experiment logs
- **last_updated**: Timestamp of the last update

## Usage

This dataset is used automatically by the Trackio monitoring system to store and retrieve experiment data. It provides persistent storage for experiment tracking across training runs.

## Integration

The dataset is used by:

- Trackio Spaces for experiment visualization
- Training scripts for logging metrics and parameters
- Monitoring systems for experiment tracking
- The SmolLM3 fine-tuning pipeline for comprehensive metrics capture

## Privacy

This dataset is public by default for easier sharing and collaboration. Only non-sensitive experiment data is stored.

## Examples

### Sample Experiment Entry

```json
{
  "experiment_id": "exp_20250720_130853",
  "name": "smollm3-finetune-demo",
  "description": "SmolLM3 fine-tuning experiment demo",
  "created_at": "2025-07-20T13:08:53",
  "status": "completed",
  "metrics": "{...}",
  "parameters": "{...}",
  "artifacts": "[]",
  "logs": "{...}",
  "last_updated": "2025-07-20T13:08:53"
}
```

This dataset is maintained by the Trackio monitoring system and automatically updated during training runs.
"""

        # Upload the README to the dataset repository via a temporary file
        import tempfile

        from huggingface_hub import upload_file

        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
            f.write(readme_content)
            temp_file = f.name
        try:
            upload_file(
                path_or_fileobj=temp_file,
                path_in_repo="README.md",
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
                commit_message="Add dataset README"
            )
            print(f"✅ Successfully added README to {repo_id}")
            return True
        finally:
            # Clean up the temporary file
            if os.path.exists(temp_file):
                os.unlink(temp_file)
    except Exception as e:
        print(f"⚠️ Could not add README to dataset: {e}")
        return False
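
# Example (sketch): refreshing the README on its own, e.g. after editing
# templates/datasets/readme.md. The repo ID is illustrative.
#
#   add_dataset_readme("username/trackio-experiments", os.environ["HF_TOKEN"])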


def main():
    """Main function to set up the dataset from the command line."""
    # Get the token from the environment first
    token = os.environ.get('HUGGING_FACE_HUB_TOKEN') or os.environ.get('HF_TOKEN')

    # If there is no token in the environment, try the first command-line argument
    if not token and len(sys.argv) > 1:
        token = sys.argv[1]
    if not token:
        print("❌ No HF token found. Set the HUGGING_FACE_HUB_TOKEN environment variable or pass the token as an argument.")
        sys.exit(1)

    # Optional dataset name as the second command-line argument
    dataset_name = sys.argv[2] if len(sys.argv) > 2 else None

    # Run the setup and propagate success/failure via the exit code
    success = setup_trackio_dataset(dataset_name, token)
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
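
# Usage examples (argument positions match main() above; the filename
# setup_hf_dataset.py is assumed):
#
#   export HUGGING_FACE_HUB_TOKEN=hf_xxx
#   python setup_hf_dataset.py
#
#   # or pass the token (and an optional custom dataset name) as arguments:
#   python setup_hf_dataset.py hf_xxx my-trackio-experiments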
