SmolFactory / scripts /trackio_tonic /configure_trackio.py
Tonic's picture
hide all tokens in logs, never persist to disk, remove max_seq_length from config, add to trainer
eb9e91f
raw
history blame
11.5 kB
#!/usr/bin/env python3
"""
Configuration script for Trackio environment variables
"""
import os
import json
import subprocess
from datetime import datetime
def get_username_from_token(token: str) -> str:
"""Get username from HF token with fallback to CLI"""
try:
# Try API first
from huggingface_hub import HfApi
api = HfApi(token=token)
user_info = api.whoami()
# Handle different possible response formats
if isinstance(user_info, dict):
# Try different possible keys for username
username = (
user_info.get('name') or
user_info.get('username') or
user_info.get('user') or
None
)
elif isinstance(user_info, str):
# If whoami returns just the username as string
username = user_info
else:
username = None
if username:
print(f"✅ Got username from API: {username}")
return username
else:
print("⚠️ Could not get username from API, trying CLI...")
return get_username_from_cli(token)
except Exception as e:
print(f"⚠️ API whoami failed: {e}")
print("⚠️ Trying CLI fallback...")
return get_username_from_cli(token)
def get_username_from_cli(token: str) -> str:
"""Fallback method to get username using CLI"""
try:
# Set HF token for CLI
os.environ['HF_TOKEN'] = token
# Get username using CLI
result = subprocess.run(
["hf", "whoami"],
capture_output=True,
text=True,
timeout=30
)
if result.returncode == 0:
username = result.stdout.strip()
if username:
print(f"✅ Got username from CLI: {username}")
return username
else:
print("⚠️ CLI returned empty username")
return None
else:
print(f"⚠️ CLI whoami failed: {result.stderr}")
return None
except Exception as e:
print(f"⚠️ CLI fallback failed: {e}")
return None
def configure_trackio():
"""Configure Trackio environment variables"""
print("🔧 Trackio Configuration")
print("=" * 40)
# Get HF token (single token approach)
hf_token = os.environ.get('HF_TOKEN')
# Use the single HF_TOKEN
active_token = hf_token
if active_token:
username = get_username_from_token(active_token)
if username:
print(f"✅ Authenticated as: {username}")
else:
print("⚠️ Could not determine username from token")
username = 'unknown'
else:
username = 'unknown'
# Use username in dataset repository if not specified
dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', f'{username}/trackio-experiments')
# Current configuration
# Never expose raw tokens in logs; only track presence
current_config = {
'HF_TOKEN': 'Set' if hf_token else 'Not set',
'TRACKIO_DATASET_REPO': dataset_repo,
'SPACE_ID': os.environ.get('SPACE_ID', 'Not set'),
'TRACKIO_URL': os.environ.get('TRACKIO_URL', 'Not set')
}
print("📋 Current Configuration:")
for key, value in current_config.items():
status = "✅" if value != "Not set" else "❌"
print(f" {status} {key}: {value}")
print("\n🎯 Configuration Options:")
print("1. Set HF_TOKEN - Main token (starts as write, switches to read after training)")
print("2. Set TRACKIO_DATASET_REPO - Dataset repository (optional)")
print("3. Set SPACE_ID - HF Space ID (auto-detected)")
print("4. Set TRACKIO_URL - Trackio Space URL (auto-detected)")
# Check if running on HF Spaces
if os.environ.get('SPACE_ID'):
print("\n🚀 Running on Hugging Face Spaces")
print(f" Space ID: {os.environ.get('SPACE_ID')}")
# Validate configuration
print("\n🔍 Configuration Validation:")
# Check HF_TOKEN
if current_config['HF_TOKEN'] != 'Not set':
print("✅ HF_TOKEN is set")
print(" This allows training operations and dataset access")
print(" Note: Token will be automatically switched from write to read after training")
else:
print("❌ HF_TOKEN is not set")
print(" Please set HF_TOKEN for training operations")
print(" Get your token from: https://huggingface.co/settings/tokens")
# Check dataset repository
print(f"📊 Dataset Repository: {dataset_repo}")
# Test dataset access if token is available
test_token = hf_token
if test_token:
print("\n🧪 Testing Dataset Access...")
try:
from datasets import load_dataset
from huggingface_hub import HfApi
# First check if the dataset repository exists
api = HfApi(token=test_token)
try:
# Try to get repository info
repo_info = api.repo_info(repo_id=dataset_repo, repo_type="dataset")
print(f"✅ Dataset repository exists: {dataset_repo}")
# Try to load the dataset
dataset = load_dataset(dataset_repo, token=test_token)
print(f"✅ Successfully loaded dataset: {dataset_repo}")
# Show experiment count
if 'train' in dataset:
experiment_count = len(dataset['train'])
print(f"📈 Found {experiment_count} experiments in dataset")
# Show sample experiments
if experiment_count > 0:
print("🔬 Sample experiments:")
for i, row in enumerate(dataset['train'][:3]): # Show first 3
exp_id = row.get('experiment_id', 'Unknown')
name = row.get('name', 'Unnamed')
print(f" {i+1}. {exp_id}: {name}")
except Exception as repo_error:
if "404" in str(repo_error) or "not found" in str(repo_error).lower():
print(f"⚠️ Dataset repository '{dataset_repo}' doesn't exist yet")
print(" This is normal if you haven't created the dataset yet")
print(" Run setup_hf_dataset.py to create the dataset")
else:
print(f"❌ Error accessing dataset repository: {repo_error}")
print(" Check that your token has read permissions")
except ImportError:
print("❌ Required packages not available")
print(" Install with: pip install datasets huggingface_hub")
except Exception as e:
print(f"❌ Failed to load dataset: {e}")
print(" This might be normal if the dataset doesn't exist yet")
print(" Run setup_hf_dataset.py to create the dataset")
else:
print("\n🧪 Dataset Access Test:")
print("❌ Cannot test dataset access - no valid token set")
# Generate configuration file
config_file = "trackio_config.json"
# Do not persist raw tokens to disk; store only presence flag
config_data = {
'hf_token_set': bool(hf_token),
'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
'space_id': current_config['SPACE_ID'],
'trackio_url': current_config['TRACKIO_URL'],
'username': username,
'last_updated': datetime.now().isoformat(),
'notes': 'Trackio configuration - HF_TOKEN starts as write token, switches to read token after training'
}
with open(config_file, 'w') as f:
json.dump(config_data, f, indent=2)
print(f"\n💾 Configuration saved to: {config_file}")
# Show environment variable commands
print("\n📝 Environment Variables for HF Space:")
print("=" * 50)
print(f"HF_TOKEN={'Set' if hf_token else 'Not set'}")
print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")
if current_config['TRACKIO_URL'] != 'Not set':
print(f"TRACKIO_URL={current_config['TRACKIO_URL']}")
print("\n🎯 Next Steps:")
print("1. HF_TOKEN will be automatically set during deployment (starts as write token)")
print("2. HF_TOKEN will be automatically switched to read token after training")
print("3. Optionally set TRACKIO_DATASET_REPO to use a different dataset")
print("4. Deploy your updated app.py to the Space")
print("5. Run setup_hf_dataset.py if you haven't created the dataset yet")
print("\n📚 Usage Examples")
print("=" * 30)
print("1. Default Dataset")
print(f" Repository: {username}/trackio-experiments")
print(" Description: Default dataset for your experiments")
print(f" Set with: TRACKIO_DATASET_REPO={username}/trackio-experiments")
print()
print("2. Personal Dataset")
print(f" Repository: {username}/trackio-experiments")
print(" Description: Your personal experiment dataset")
print(f" Set with: TRACKIO_DATASET_REPO={username}/trackio-experiments")
print()
print("3. Team Dataset")
print(" Repository: your-org/team-experiments")
print(" Description: Shared dataset for team experiments")
print(" Set with: TRACKIO_DATASET_REPO=your-org/team-experiments")
print()
print("4. Project Dataset")
print(f" Repository: {username}/smollm3-experiments")
print(" Description: Dataset specific to SmolLM3 experiments")
print(f" Set with: TRACKIO_DATASET_REPO={username}/smollm3-experiments")
def show_usage_examples():
"""Show usage examples for different dataset configurations"""
examples = [
{
'name': 'Default Dataset',
'repo': 'your-username/trackio-experiments',
'description': 'Default dataset for your experiments',
'env_var': 'TRACKIO_DATASET_REPO=your-username/trackio-experiments'
},
{
'name': 'Personal Dataset',
'repo': 'your-username/trackio-experiments',
'description': 'Your personal experiment dataset',
'env_var': 'TRACKIO_DATASET_REPO=your-username/trackio-experiments'
},
{
'name': 'Team Dataset',
'repo': 'your-org/team-experiments',
'description': 'Shared dataset for team experiments',
'env_var': 'TRACKIO_DATASET_REPO=your-org/team-experiments'
},
{
'name': 'Project Dataset',
'repo': 'your-username/smollm3-experiments',
'description': 'Dataset specific to SmolLM3 experiments',
'env_var': 'TRACKIO_DATASET_REPO=your-username/smollm3-experiments'
}
]
print("\n📚 Usage Examples")
print("=" * 30)
for i, example in enumerate(examples, 1):
print(f"{i}. {example['name']}")
print(f" Repository: {example['repo']}")
print(f" Description: {example['description']}")
print(f" Set with: {example['env_var']}")
print()
if __name__ == "__main__":
configure_trackio()