Spaces:

Tonic
/

SmolFactory

Running

App Files Files Community

SmolFactory / scripts /trackio_tonic /configure_trackio.py

Tonic

hide all tokens in logs, never persist to disk, remove max_seq_length from config, add to trainer

eb9e91f about 1 month ago

raw

history blame

11.5 kB

	#!/usr/bin/env python3
	"""
	Configuration script for Trackio environment variables
	"""

	import json
	import os
	import subprocess
	from datetime import datetime
	from typing import Optional

	def get_username_from_token(token: str) -> str:
	    """Resolve the Hugging Face username that owns *token*.

	    The Hub API (``HfApi.whoami``) is asked first. When the API response
	    yields no usable name, or anything on that path raises, the lookup is
	    delegated to ``get_username_from_cli`` and its result is returned
	    unchanged.

	    Args:
	        token: Hugging Face access token to identify.

	    Returns:
	        The username from the API when one is found, otherwise whatever
	        ``get_username_from_cli(token)`` returns.
	    """
	    try:
	        # Imported lazily so a missing huggingface_hub lands in the
	        # except branch below and triggers the CLI fallback.
	        from huggingface_hub import HfApi

	        whoami_response = HfApi(token=token).whoami()

	        resolved = None
	        if isinstance(whoami_response, dict):
	            # The response may carry the name under different keys;
	            # the first key with a truthy value wins.
	            for candidate_key in ('name', 'username', 'user'):
	                candidate = whoami_response.get(candidate_key)
	                if candidate:
	                    resolved = candidate
	                    break
	        elif isinstance(whoami_response, str):
	            # whoami handed back the bare username.
	            resolved = whoami_response

	        if not resolved:
	            print("⚠️ Could not get username from API, trying CLI...")
	            return get_username_from_cli(token)

	        print(f"✅ Got username from API: {resolved}")
	        return resolved

	    except Exception as api_error:
	        print(f"⚠️ API whoami failed: {api_error}")
	        print("⚠️ Trying CLI fallback...")
	        return get_username_from_cli(token)

	def get_username_from_cli(token: str) -> Optional[str]:
	    """Look up the Hugging Face username by running ``hf whoami``.

	    The token is handed to the child process through its environment
	    only; this process's ``os.environ`` is left untouched.

	    Args:
	        token: Hugging Face access token used to authenticate the CLI call.

	    Returns:
	        The username reported by the CLI, or ``None`` when the command
	        cannot be run, times out, exits non-zero, or prints no username.
	    """
	    try:
	        # Scope the token to the subprocess instead of overwriting the
	        # caller's HF_TOKEN as a lasting side effect.
	        child_env = dict(os.environ, HF_TOKEN=token)

	        result = subprocess.run(
	            ["hf", "whoami"],
	            capture_output=True,
	            text=True,
	            timeout=30,
	            env=child_env,
	        )
	    except Exception as e:
	        # Best-effort fallback: a missing binary, a timeout or a bad
	        # token value must not crash the caller.
	        print(f"⚠️ CLI fallback failed: {e}")
	        return None

	    if result.returncode != 0:
	        print(f"⚠️ CLI whoami failed: {result.stderr}")
	        return None

	    username = _parse_whoami_output(result.stdout)
	    if not username:
	        print("⚠️ CLI returned empty username")
	        return None

	    print(f"✅ Got username from CLI: {username}")
	    return username


	def _parse_whoami_output(stdout: str) -> Optional[str]:
	    """Pull the username out of the text printed by the whoami command.

	    The CLI can print more than the bare name (for example a following
	    ``orgs: ...`` line), so the whole output must not be used as the
	    username. The first non-empty line that is not the orgs line is taken.
	    """
	    import re

	    ansi_codes = re.compile(r"\x1b\[[0-9;]*m")
	    for raw_line in stdout.splitlines():
	        line = ansi_codes.sub("", raw_line).strip()
	        if not line or line.lower().startswith("orgs:"):
	            continue
	        # NOTE(review): some CLI versions appear to label the line as
	        # "user: <name>"; the label is dropped if present - confirm
	        # against the installed huggingface_hub version.
	        if line.lower().startswith("user:"):
	            line = line[len("user:"):].strip()
	        return line or None
	    return None

	def configure_trackio():
	    """Report, validate and persist the Trackio environment configuration.

	    Reads HF_TOKEN, TRACKIO_DATASET_REPO, SPACE_ID and TRACKIO_URL from the
	    environment, prints a status report, probes the dataset repository
	    when a token is available, and writes a summary to
	    ``trackio_config.json`` in the current working directory.

	    The raw token is never printed or written to disk; only whether it is
	    set is recorded.

	    Returns:
	        None. All results are printed or written to the config file.
	    """
	    print("🔧 Trackio Configuration")
	    print("=" * 40)

	    # Single-token approach: HF_TOKEN is the only credential consulted.
	    hf_token = os.environ.get('HF_TOKEN')

	    username = None
	    if hf_token:
	        username = get_username_from_token(hf_token)
	        if username:
	            print(f"✅ Authenticated as: {username}")
	        else:
	            print("⚠️ Could not determine username from token")
	    if not username:
	        username = 'unknown'

	    # Default the dataset repository to the user's namespace unless the
	    # environment names one explicitly.
	    dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', f'{username}/trackio-experiments')

	    # Never expose raw tokens in logs; only track presence.
	    current_config = {
	        'HF_TOKEN': 'Set' if hf_token else 'Not set',
	        'TRACKIO_DATASET_REPO': dataset_repo,
	        'SPACE_ID': os.environ.get('SPACE_ID', 'Not set'),
	        'TRACKIO_URL': os.environ.get('TRACKIO_URL', 'Not set'),
	    }

	    print("📋 Current Configuration:")
	    for key, value in current_config.items():
	        status = "✅" if value != "Not set" else "❌"
	        print(f" {status} {key}: {value}")

	    print("\n🎯 Configuration Options:")
	    print("1. Set HF_TOKEN - Main token (starts as write, switches to read after training)")
	    print("2. Set TRACKIO_DATASET_REPO - Dataset repository (optional)")
	    print("3. Set SPACE_ID - HF Space ID (auto-detected)")
	    print("4. Set TRACKIO_URL - Trackio Space URL (auto-detected)")

	    # A non-empty SPACE_ID is taken to mean the script runs on HF Spaces.
	    space_id = os.environ.get('SPACE_ID')
	    if space_id:
	        print("\n🚀 Running on Hugging Face Spaces")
	        print(f" Space ID: {space_id}")

	    print("\n🔍 Configuration Validation:")

	    if hf_token:
	        print("✅ HF_TOKEN is set")
	        print(" This allows training operations and dataset access")
	        print(" Note: Token will be automatically switched from write to read after training")
	    else:
	        print("❌ HF_TOKEN is not set")
	        print(" Please set HF_TOKEN for training operations")
	        print(" Get your token from: https://huggingface.co/settings/tokens")

	    print(f"📊 Dataset Repository: {dataset_repo}")

	    # The dataset can only be probed with a token.
	    if hf_token:
	        print("\n🧪 Testing Dataset Access...")
	        _test_dataset_access(dataset_repo, hf_token)
	    else:
	        print("\n🧪 Dataset Access Test:")
	        print("❌ Cannot test dataset access - no valid token set")

	    config_file = "trackio_config.json"
	    # Do not persist raw tokens to disk; store only a presence flag.
	    config_data = {
	        'hf_token_set': bool(hf_token),
	        'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
	        'space_id': current_config['SPACE_ID'],
	        'trackio_url': current_config['TRACKIO_URL'],
	        'username': username,
	        'last_updated': datetime.now().isoformat(),
	        'notes': 'Trackio configuration - HF_TOKEN starts as write token, switches to read token after training'
	    }

	    with open(config_file, 'w', encoding='utf-8') as f:
	        json.dump(config_data, f, indent=2)

	    print(f"\n💾 Configuration saved to: {config_file}")

	    print("\n📝 Environment Variables for HF Space:")
	    print("=" * 50)
	    print(f"HF_TOKEN={'Set' if hf_token else 'Not set'}")
	    print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")
	    if current_config['TRACKIO_URL'] != 'Not set':
	        print(f"TRACKIO_URL={current_config['TRACKIO_URL']}")

	    print("\n🎯 Next Steps:")
	    for step in (
	        "1. HF_TOKEN will be automatically set during deployment (starts as write token)",
	        "2. HF_TOKEN will be automatically switched to read token after training",
	        "3. Optionally set TRACKIO_DATASET_REPO to use a different dataset",
	        "4. Deploy your updated app.py to the Space",
	        "5. Run setup_hf_dataset.py if you haven't created the dataset yet",
	    ):
	        print(step)

	    _print_personalized_examples(username)


	def _test_dataset_access(dataset_repo, token):
	    """Check that *dataset_repo* exists and loads, printing what is found."""
	    try:
	        from datasets import load_dataset
	        from huggingface_hub import HfApi

	        api = HfApi(token=token)

	        try:
	            # Called only for its failure mode: it raises when the
	            # repository cannot be accessed.
	            api.repo_info(repo_id=dataset_repo, repo_type="dataset")
	            print(f"✅ Dataset repository exists: {dataset_repo}")

	            dataset = load_dataset(dataset_repo, token=token)
	            print(f"✅ Successfully loaded dataset: {dataset_repo}")

	            if 'train' in dataset:
	                train_split = dataset['train']
	                experiment_count = len(train_split)
	                print(f"📈 Found {experiment_count} experiments in dataset")

	                if experiment_count > 0:
	                    print("🔬 Sample experiments:")
	                    # Index row by row: slicing a Dataset yields a dict
	                    # of columns, not a sequence of row dicts.
	                    for i in range(min(3, experiment_count)):
	                        row = train_split[i]
	                        exp_id = row.get('experiment_id', 'Unknown')
	                        name = row.get('name', 'Unnamed')
	                        print(f" {i+1}. {exp_id}: {name}")

	        except Exception as repo_error:
	            message = str(repo_error)
	            if "404" in message or "not found" in message.lower():
	                print(f"⚠️ Dataset repository '{dataset_repo}' doesn't exist yet")
	                print(" This is normal if you haven't created the dataset yet")
	                print(" Run setup_hf_dataset.py to create the dataset")
	            else:
	                print(f"❌ Error accessing dataset repository: {repo_error}")
	                print(" Check that your token has read permissions")

	    except ImportError:
	        print("❌ Required packages not available")
	        print(" Install with: pip install datasets huggingface_hub")
	    except Exception as e:
	        print(f"❌ Failed to load dataset: {e}")
	        print(" This might be normal if the dataset doesn't exist yet")
	        print(" Run setup_hf_dataset.py to create the dataset")


	def _print_personalized_examples(username):
	    """Print sample TRACKIO_DATASET_REPO values built from *username*."""
	    examples = (
	        ("Default Dataset", f"{username}/trackio-experiments",
	         "Default dataset for your experiments"),
	        ("Personal Dataset", f"{username}/trackio-experiments",
	         "Your personal experiment dataset"),
	        ("Team Dataset", "your-org/team-experiments",
	         "Shared dataset for team experiments"),
	        ("Project Dataset", f"{username}/smollm3-experiments",
	         "Dataset specific to SmolLM3 experiments"),
	    )

	    print("\n📚 Usage Examples")
	    print("=" * 30)
	    for number, (title, repo, description) in enumerate(examples, start=1):
	        print(f"{number}. {title}")
	        print(f" Repository: {repo}")
	        print(f" Description: {description}")
	        print(f" Set with: TRACKIO_DATASET_REPO={repo}")
	        # Blank line between entries, none after the last one.
	        if number < len(examples):
	            print()

	def show_usage_examples():
	    """Print a numbered catalogue of sample TRACKIO_DATASET_REPO settings.

	    The entries use generic placeholder owners (``your-username`` and
	    ``your-org``). Everything is written to stdout; nothing is returned.
	    """
	    # (title, repository id, description). The "Set with" line is derived
	    # from the repository id.
	    catalogue = (
	        ('Default Dataset',
	         'your-username/trackio-experiments',
	         'Default dataset for your experiments'),
	        ('Personal Dataset',
	         'your-username/trackio-experiments',
	         'Your personal experiment dataset'),
	        ('Team Dataset',
	         'your-org/team-experiments',
	         'Shared dataset for team experiments'),
	        ('Project Dataset',
	         'your-username/smollm3-experiments',
	         'Dataset specific to SmolLM3 experiments'),
	    )

	    print("\n📚 Usage Examples")
	    print("=" * 30)

	    for number, (title, repo_id, summary) in enumerate(catalogue, start=1):
	        entry = "\n".join((
	            f"{number}. {title}",
	            f" Repository: {repo_id}",
	            f" Description: {summary}",
	            f" Set with: TRACKIO_DATASET_REPO={repo_id}",
	        ))
	        print(entry)
	        # Blank separator line after every entry, including the last.
	        print()

	# Run the configuration report only when executed as a script, not on import.
	if __name__ == "__main__":
	configure_trackio()