Joseph Pollack committed on
Commit
676b3f3
·
unverified ·
1 Parent(s): b9f51a0

adds automatic authentication, dataset README, push-to-Hub automation, demo, README, and interface improvements

interface.py CHANGED
@@ -155,6 +155,104 @@ def _save_uploaded_dataset(files: list, transcripts: list[str]) -> str:
     return str(jsonl_path)


+def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") -> str:
+    """Push dataset to Hugging Face Hub"""
+    try:
+        from huggingface_hub import HfApi, create_repo
+        import json
+        from pathlib import Path
+
+        token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            return "❌ No HF_TOKEN found. Set HF_TOKEN environment variable to push datasets."
+
+        api = HfApi(token=token)
+
+        # Determine full repo name
+        if "/" not in repo_name:
+            if not username:
+                user_info = api.whoami()
+                username = user_info.get("name") or user_info.get("username") or ""
+            if username:
+                repo_name = f"{username}/{repo_name}"
+
+        # Create dataset repository
+        try:
+            create_repo(repo_name, repo_type="dataset", token=token, exist_ok=True)
+        except Exception as e:
+            if "already exists" not in str(e).lower():
+                return f"❌ Failed to create dataset repo: {e}"
+
+        # Read the JSONL file
+        jsonl_file = Path(jsonl_path)
+        if not jsonl_file.exists():
+            return f"❌ Dataset file not found: {jsonl_path}"
+
+        # Upload the JSONL file
+        api.upload_file(
+            path_or_fileobj=str(jsonl_file),
+            path_in_repo="data.jsonl",
+            repo_id=repo_name,
+            repo_type="dataset",
+            token=token
+        )
+
+        # Create a simple README for the dataset
+        readme_content = f"""---
+dataset_info:
+  features:
+  - name: audio_path
+    dtype: string
+  - name: text
+    dtype: string
+  splits:
+  - name: train
+    num_bytes: {jsonl_file.stat().st_size}
+    num_examples: {sum(1 for _ in open(jsonl_file))}
+  download_size: {jsonl_file.stat().st_size}
+  dataset_size: {jsonl_file.stat().st_size}
+---
+
+# Voxtral ASR Dataset
+
+This dataset was created using the Voxtral ASR Fine-tuning Interface.
+
+## Dataset Structure
+
+- **audio_path**: Path to the audio file
+- **text**: Transcription of the audio
+
+## Usage
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("{repo_name}")
+```
+"""
+
+        # Upload README
+        readme_path = jsonl_file.parent / "README.md"
+        with open(readme_path, "w") as f:
+            f.write(readme_content)
+
+        api.upload_file(
+            path_or_fileobj=str(readme_path),
+            path_in_repo="README.md",
+            repo_id=repo_name,
+            repo_type="dataset",
+            token=token
+        )
+
+        readme_path.unlink()  # Clean up temp file
+
+        return f"✅ Dataset pushed to: https://huggingface.co/datasets/{repo_name}"
+
+    except Exception as e:
+        return f"❌ Failed to push dataset: {e}"
+
+
 def _save_recordings(recordings: list[tuple[int, list]], transcripts: list[str]) -> str:
     import soundfile as sf
     dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
@@ -231,6 +329,7 @@ def start_voxtral_training(
     repo_name = f"{username}/{repo_short}" if username else repo_short
     push_args = [
         str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
+        "model",
         str(output_dir),
         repo_name,
     ]
@@ -519,6 +618,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
         gr.update(visible=True),  # dataset_status
         gr.update(visible=True),  # advanced_accordion
         gr.update(visible=True),  # save_rec_btn
+        gr.update(visible=True),  # push_recordings_btn
         gr.update(visible=True),  # start_btn
         gr.update(visible=True),  # logs_box
     ]
@@ -607,17 +707,27 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
     gr.Markdown("### Upload audio + transcripts (optional)")
     upload_audio = gr.File(file_count="multiple", type="filepath", label="Upload WAV/FLAC files (optional)")
     transcripts_box = gr.Textbox(lines=6, label="Transcripts (one per line, aligned with files)")
+    dataset_repo_name = gr.Textbox(value=f"voxtral-dataset-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+                                   label="Dataset repo name (will be pushed to HF Hub)")
     save_upload_btn = gr.Button("Save uploaded dataset")
+    push_dataset_btn = gr.Button("Push dataset to HF Hub")

     def _collect_upload(files, txt):
         lines = [s.strip() for s in (txt or "").splitlines() if s.strip()]
-        return _save_uploaded_dataset(files or [], lines)
+        jsonl_path = _save_uploaded_dataset(files or [], lines)
+        return f"✅ Dataset saved locally: {jsonl_path}"

-    # Removed - no longer needed since jsonl_out was removed
-    # save_upload_btn.click(_collect_upload, [upload_audio, transcripts_box], [])
+    def _push_dataset_handler(repo_name):
+        if not jsonl_path_state.value:
+            return "❌ No dataset saved yet. Please save dataset first."
+        return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
+
+    save_upload_btn.click(_collect_upload, [upload_audio, transcripts_box], [jsonl_path_state])
+    push_dataset_btn.click(_push_dataset_handler, [dataset_repo_name], [jsonl_path_state])

     # Save recordings button
     save_rec_btn = gr.Button("Save recordings as dataset", visible=False)
+    push_recordings_btn = gr.Button("Push recordings dataset to HF Hub", visible=False)

     def _collect_preloaded_recs(*recs_and_texts):
         import soundfile as sf
@@ -646,6 +756,13 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

     save_rec_btn.click(_collect_preloaded_recs, rec_components + [phrase_texts_state], [jsonl_path_state])

+    def _push_recordings_handler(repo_name):
+        if not jsonl_path_state.value:
+            return "❌ No recordings dataset saved yet. Please save recordings first."
+        return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
+
+    push_recordings_btn.click(_push_recordings_handler, [dataset_repo_name], [jsonl_path_state])
+
     # Removed multilingual dataset sample section - phrases are now loaded automatically when language is selected

     start_btn = gr.Button("Start Fine-tuning", visible=False)
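Note: the `_push_dataset_to_hub` helper added above is plain `huggingface_hub` plumbing, so the same push can be reproduced outside the Gradio UI. A minimal sketch, assuming an `HF_TOKEN` in the environment; the repo id and JSONL path are illustrative:

```python
import os
from huggingface_hub import HfApi, create_repo

token = os.getenv("HF_TOKEN")               # same variable the helper reads
repo_id = "your-user/voxtral-dataset-demo"  # hypothetical dataset repo
create_repo(repo_id, repo_type="dataset", token=token, exist_ok=True)
HfApi(token=token).upload_file(
    path_or_fileobj="datasets/voxtral_user/data.jsonl",  # illustrative local path
    path_in_repo="data.jsonl",
    repo_id=repo_id,
    repo_type="dataset",
)
```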
requirements.txt CHANGED
@@ -2,4 +2,6 @@ torch
 datasets
 peft
 transformers
-gradio
+gradio
+trackio
+huggingface_hub
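A quick sanity check (not part of the commit) that the newly added dependencies resolve after `pip install -r requirements.txt`:

```python
# each import fails loudly if the corresponding requirement is missing
import gradio
import trackio
import huggingface_hub

print(gradio.__version__, huggingface_hub.__version__)
```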
scripts/push_to_huggingface.py CHANGED
@@ -1,20 +1,26 @@
 #!/usr/bin/env python3
 """
-Push Trained Model and Results to Hugging Face Hub
-Integrates with Trackio monitoring and HF Datasets for complete model deployment
+Push Trained Models and Datasets to Hugging Face Hub
+
+Usage:
+    # Push a trained model
+    python push_to_huggingface.py model /path/to/model my-model-repo
+
+    # Push a dataset
+    python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo
+
+Authentication:
+    Set HF_TOKEN environment variable or use --token:
+    export HF_TOKEN=your_token_here
 """

 import os
 import json
 import argparse
 import logging
-import time
 from pathlib import Path
-from typing import Dict, Any, Optional, List
+from typing import Dict, Any, Optional
 from datetime import datetime
-import subprocess
-import shutil
-import platform

 # Set timeout for HF operations to prevent hanging
 os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
@@ -22,34 +28,15 @@ os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600'

 try:
     from huggingface_hub import HfApi, create_repo, upload_file
-    from huggingface_hub import snapshot_download, hf_hub_download
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
     print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")

-try:
-    import sys
-    import os
-    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
-    from monitoring import SmolLM3Monitor
-    MONITORING_AVAILABLE = True
-except ImportError:
-    MONITORING_AVAILABLE = False
-    print("Warning: monitoring module not available")
-
 logger = logging.getLogger(__name__)

-class TimeoutError(Exception):
-    """Custom timeout exception"""
-    pass
-
-def timeout_handler(signum, frame):
-    """Signal handler for timeout"""
-    raise TimeoutError("Operation timed out")
-
 class HuggingFacePusher:
-    """Push trained models and results to Hugging Face Hub with HF Datasets integration"""
+    """Push trained models to Hugging Face Hub"""

     def __init__(
         self,
@@ -57,44 +44,22 @@ class HuggingFacePusher:
         repo_name: str,
         token: Optional[str] = None,
         private: bool = False,
-        trackio_url: Optional[str] = None,
-        experiment_name: Optional[str] = None,
-        dataset_repo: Optional[str] = None,
-        hf_token: Optional[str] = None,
         author_name: Optional[str] = None,
         model_description: Optional[str] = None,
-        training_config_type: Optional[str] = None,
         model_name: Optional[str] = None,
-        dataset_name: Optional[str] = None,
-        batch_size: Optional[str] = None,
-        learning_rate: Optional[str] = None,
-        max_epochs: Optional[str] = None,
-        max_seq_length: Optional[str] = None,
-        trainer_type: Optional[str] = None
+        dataset_name: Optional[str] = None
     ):
         self.model_path = Path(model_path)
         # Original user input (may be just the repo name without username)
         self.repo_name = repo_name
-        self.token = token or hf_token or os.getenv('HF_TOKEN')
+        self.token = token or os.getenv('HF_TOKEN')
         self.private = private
-        self.trackio_url = trackio_url
-        self.experiment_name = experiment_name
         self.author_name = author_name
         self.model_description = model_description
-
-        # Training configuration details for model card generation
-        self.training_config_type = training_config_type
-        self.model_name = model_name
+
+        # Model card generation details
+        self.model_name = model_name
         self.dataset_name = dataset_name
-        self.batch_size = batch_size
-        self.learning_rate = learning_rate
-        self.max_epochs = max_epochs
-        self.max_seq_length = max_seq_length
-        self.trainer_type = trainer_type
-
-        # HF Datasets configuration
-        self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
-        self.hf_token = hf_token or os.getenv('HF_TOKEN')

         # Initialize HF API
         if HF_AVAILABLE:
@@ -105,19 +70,7 @@
         # Resolve the full repo id (username/repo) if user only provided repo name
         self.repo_id = self._resolve_repo_id(self.repo_name)

-        # Initialize monitoring if available
-        self.monitor = None
-        if MONITORING_AVAILABLE:
-            self.monitor = SmolLM3Monitor(
-                experiment_name=experiment_name or "model_push",
-                trackio_url=trackio_url,
-                enable_tracking=bool(trackio_url),
-                hf_token=self.hf_token,
-                dataset_repo=self.dataset_repo
-            )
-
         logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
-        logger.info(f"Dataset repository: {self.dataset_repo}")

     def _resolve_repo_id(self, repo_name: str) -> str:
         """Return a fully-qualified repo id in the form username/repo.
@@ -515,59 +468,33 @@ MIT License
             logger.error(f"❌ Failed to create README: {e}")
             return False

-    def log_to_trackio(self, action: str, details: Dict[str, Any]):
-        """Log push action to Trackio and HF Datasets"""
-        if self.monitor:
-            try:
-                # Log to Trackio
-                self.monitor.log_metrics({
-                    "push_action": action,
-                    "repo_name": self.repo_id,
-                    "model_size_gb": self._get_model_size(),
-                    "dataset_repo": self.dataset_repo,
-                    **details
-                })
-
-                # Log training summary
-                self.monitor.log_training_summary({
-                    "model_push": True,
-                    "model_repo": self.repo_id,
-                    "dataset_repo": self.dataset_repo,
-                    "push_date": datetime.now().isoformat(),
-                    **details
-                })
-
-                logger.info(f"✅ Logged {action} to Trackio and HF Datasets")
-            except Exception as e:
-                logger.error(f"❌ Failed to log to Trackio: {e}")
-
-    def push_model(self, training_config: Optional[Dict[str, Any]] = None,
+
+    def push_model(self, training_config: Optional[Dict[str, Any]] = None,
                    results: Optional[Dict[str, Any]] = None) -> bool:
-        """Complete model push process with HF Datasets integration"""
+        """Complete model push process"""
         logger.info(f"🚀 Starting model push to {self.repo_id}")
-        logger.info(f"📊 Dataset repository: {self.dataset_repo}")
-
+
         # Validate model path
         if not self.validate_model_path():
             return False
-
+
         # Create repository
         if not self.create_repository():
             return False
-
+
         # Load training config and results if not provided
         if training_config is None:
             training_config = self._load_training_config()
-
+
         if results is None:
             results = self._load_training_results()
-
+
         # Create and upload model card
         model_card = self.create_model_card(training_config, results)
         model_card_path = Path("temp_model_card.md")
         with open(model_card_path, "w") as f:
             f.write(model_card)
-
+
         try:
             upload_file(
                 path_or_fileobj=str(model_card_path),
@@ -577,27 +504,135 @@ MIT License
             )
         finally:
             model_card_path.unlink()
-
+
         # Upload model files
         if not self.upload_model_files():
             return False
-
+
         # Upload training results
         if results:
             self.upload_training_results(str(self.model_path))
-
-        # Log to Trackio and HF Datasets
-        self.log_to_trackio("model_push", {
-            "model_path": str(self.model_path),
-            "repo_name": self.repo_name,
-            "private": self.private,
-            "training_config": training_config,
-            "results": results
-        })
-
+
+        # Log success
+        logger.info(f"✅ Model successfully pushed to {self.repo_id}")
         logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_id}")
-        logger.info(f"📊 Experiment data stored in: {self.dataset_repo}")
+
         return True
+
+    def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
+        """Push dataset to Hugging Face Hub"""
+        logger.info(f"🚀 Starting dataset push to {dataset_repo_name}")
+
+        try:
+            from huggingface_hub import create_repo
+            import json
+
+            # Determine full dataset repo name
+            if "/" not in dataset_repo_name:
+                dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}"
+
+            # Create dataset repository
+            try:
+                create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True)
+                logger.info(f"✅ Created dataset repository: {dataset_repo_name}")
+            except Exception as e:
+                if "already exists" not in str(e).lower():
+                    logger.error(f"❌ Failed to create dataset repo: {e}")
+                    return False
+                logger.info(f"📁 Dataset repository already exists: {dataset_repo_name}")
+
+            # Read the dataset file
+            dataset_file = Path(dataset_path)
+            if not dataset_file.exists():
+                logger.error(f"❌ Dataset file not found: {dataset_path}")
+                return False
+
+            # Count lines for metadata
+            with open(dataset_file, 'r', encoding='utf-8') as f:
+                num_examples = sum(1 for _ in f)
+
+            file_size = dataset_file.stat().st_size
+
+            # Upload the dataset file
+            upload_file(
+                path_or_fileobj=str(dataset_file),
+                path_in_repo="data.jsonl",
+                repo_id=dataset_repo_name,
+                repo_type="dataset",
+                token=self.token
+            )
+            logger.info(f"✅ Uploaded dataset file: {dataset_file.name}")
+
+            # Create a dataset README
+            readme_content = f"""---
+dataset_info:
+  features:
+  - name: audio_path
+    dtype: string
+  - name: text
+    dtype: string
+  splits:
+  - name: train
+    num_bytes: {file_size}
+    num_examples: {num_examples}
+  download_size: {file_size}
+  dataset_size: {file_size}
+tags:
+- voxtral
+- asr
+- fine-tuning
+- conversational
+- speech-to-text
+- audio-to-text
+- tonic
+---
+
+# Voxtral ASR Dataset
+
+This dataset was created for fine-tuning Voxtral ASR models.
+
+## Dataset Structure
+
+- **audio_path**: Path to the audio file
+- **text**: Transcription of the audio
+
+## Statistics
+
+- Number of examples: {num_examples}
+- File size: {file_size} bytes
+
+## Usage
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("{dataset_repo_name}")
+```
+"""
+
+            # Upload README
+            readme_path = dataset_file.parent / "README.md"
+            with open(readme_path, "w") as f:
+                f.write(readme_content)
+
+            upload_file(
+                path_or_fileobj=str(readme_path),
+                path_in_repo="README.md",
+                repo_id=dataset_repo_name,
+                repo_type="dataset",
+                token=self.token
+            )
+
+            readme_path.unlink()  # Clean up temp file
+
+            logger.info(f"✅ Dataset README uploaded")
+            logger.info(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
+
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to push dataset: {e}")
+            return False

     def _load_training_config(self) -> Dict[str, Any]:
         """Load training configuration"""
@@ -619,81 +654,94 @@ def parse_args():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')

-    # Required arguments
-    parser.add_argument('model_path', type=str, help='Path to trained model directory')
-    parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
-
-    # Optional arguments
-    parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
-    parser.add_argument('--hf-token', type=str, default=None, help='Hugging Face token (alternative to --token)')
-    parser.add_argument('--private', action='store_true', help='Make repository private')
-    parser.add_argument('--trackio-url', type=str, default=None, help='Trackio Space URL for logging')
-    parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for Trackio')
-    parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
-    parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
-    parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
-    parser.add_argument('--training-config-type', type=str, default=None, help='Training configuration type')
-    parser.add_argument('--model-name', type=str, default=None, help='Base model name')
-    parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
-    parser.add_argument('--batch-size', type=str, default=None, help='Batch size')
-    parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate')
-    parser.add_argument('--max-epochs', type=str, default=None, help='Maximum epochs')
-    parser.add_argument('--max-seq-length', type=str, default=None, help='Maximum sequence length')
-    parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
+    # Subcommands
+    subparsers = parser.add_subparsers(dest='command', help='Available commands')
+
+    # Model push subcommand
+    model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub')
+    model_parser.add_argument('model_path', type=str, help='Path to trained model directory')
+    model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
+    model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
+    model_parser.add_argument('--private', action='store_true', help='Make repository private')
+    model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
+    model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
+    model_parser.add_argument('--model-name', type=str, default=None, help='Base model name')
+    model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
+
+    # Dataset push subcommand
+    dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub')
+    dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file')
+    dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
+    dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
+    dataset_parser.add_argument('--private', action='store_true', help='Make repository private')

     return parser.parse_args()

 def main():
     """Main function"""
     args = parse_args()
-
+
     # Setup logging
     logging.basicConfig(
         level=logging.INFO,
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
     )
-
-    logger.info("Starting model push to Hugging Face Hub")
-
-    # Initialize pusher
+
+    if not args.command:
+        logger.error("❌ No command specified. Use 'model' or 'dataset' subcommand.")
+        return 1
+
     try:
-        pusher = HuggingFacePusher(
-            model_path=args.model_path,
-            repo_name=args.repo_name,
-            token=args.token,
-            private=args.private,
-            trackio_url=args.trackio_url,
-            experiment_name=args.experiment_name,
-            dataset_repo=args.dataset_repo,
-            hf_token=args.hf_token,
-            author_name=args.author_name,
-            model_description=args.model_description,
-            training_config_type=args.training_config_type,
-            model_name=args.model_name,
-            dataset_name=args.dataset_name,
-            batch_size=args.batch_size,
-            learning_rate=args.learning_rate,
-            max_epochs=args.max_epochs,
-            max_seq_length=args.max_seq_length,
-            trainer_type=args.trainer_type
-        )
-
-        # Push model
-        success = pusher.push_model()
-
-        if success:
-            logger.info("✅ Model push completed successfully!")
-            logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
-            if args.dataset_repo:
-                logger.info(f"📊 View experiment data at: https://huggingface.co/datasets/{args.dataset_repo}")
-        else:
-            logger.error("❌ Model push failed!")
-            return 1
-
+        if args.command == 'model':
+            logger.info("Starting model push to Hugging Face Hub")
+
+            # Initialize pusher
+            pusher = HuggingFacePusher(
+                model_path=args.model_path,
+                repo_name=args.repo_name,
+                token=args.token,
+                private=args.private,
+                author_name=args.author_name,
+                model_description=args.model_description,
+                model_name=args.model_name,
+                dataset_name=args.dataset_name
+            )
+
+            # Push model
+            success = pusher.push_model()
+
+            if success:
+                logger.info("✅ Model push completed successfully!")
+                logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
+            else:
+                logger.error("❌ Model push failed!")
+                return 1
+
+        elif args.command == 'dataset':
+            logger.info("Starting dataset push to Hugging Face Hub")
+
+            # Initialize pusher for dataset
+            pusher = HuggingFacePusher(
+                model_path="",  # Not needed for dataset push
+                repo_name=args.repo_name,
+                token=args.token,
+                private=args.private
+            )
+
+            # Push dataset
+            success = pusher.push_dataset(args.dataset_path, args.repo_name)
+
+            if success:
+                logger.info("✅ Dataset push completed successfully!")
+                logger.info(f"📊 View your dataset at: https://huggingface.co/datasets/{args.repo_name}")
+            else:
+                logger.error("❌ Dataset push failed!")
+                return 1
+
     except Exception as e:
-        logger.error(f"❌ Error during model push: {e}")
+        logger.error(f"❌ Error during push: {e}")
         return 1
-
+
     return 0

 if __name__ == "__main__":
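Besides the CLI shown in the module docstring, the pusher can also be driven from Python. A minimal sketch under the constructor signature above; the checkpoint path and repo name are illustrative:

```python
from push_to_huggingface import HuggingFacePusher  # run from within scripts/

pusher = HuggingFacePusher(
    model_path="outputs/voxtral-finetuned",  # hypothetical trained-model directory
    repo_name="voxtral-finetuned-demo",      # username auto-resolved from HF_TOKEN
)
if pusher.push_model():
    print("push complete")
```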
scripts/train.py CHANGED
@@ -1,8 +1,31 @@
 #!/usr/bin/env python3
+"""
+Voxtral ASR Full Fine-tuning Script with Trackio Integration
+
+This script fine-tunes Voxtral models for ASR tasks with automatic experiment tracking
+via Trackio and Hugging Face Spaces.
+
+Features:
+- Automatic username detection from HF_TOKEN environment variable
+- Auto-generated space names with timestamps
+- Local-only mode when no HF_TOKEN is set
+- Comprehensive experiment logging
+- Optional dataset pushing to Hugging Face Hub
+
+Authentication:
+    Set HF_TOKEN environment variable to enable automatic space creation:
+    Linux/Mac: export HF_TOKEN=your_token_here
+    Windows: set HF_TOKEN=your_token_here
+    Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
+
+    Get your token from: https://huggingface.co/settings/tokens
+"""

 import argparse
 import json
 from pathlib import Path
+from datetime import datetime
+from typing import Tuple, Optional
 import torch
 from datasets import load_dataset, Audio, Dataset
 from transformers import (
@@ -11,6 +34,85 @@ from transformers import (
     Trainer,
     TrainingArguments,
 )
+from huggingface_hub import HfApi
+import trackio
+
+
+def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    Validate a Hugging Face token and return the username.
+
+    Args:
+        token (str): The Hugging Face token to validate
+
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+        - success: True if token is valid, False otherwise
+        - username: The username associated with the token (if valid)
+        - error_message: Error message if validation failed
+    """
+    try:
+        # Create API client with token directly
+        api = HfApi(token=token)
+
+        # Try to get user info - this will fail if token is invalid
+        user_info = api.whoami()
+
+        # Extract username from user info
+        username = user_info.get("name", user_info.get("username"))
+
+        if not username:
+            return False, None, "Could not retrieve username from token"
+
+        return True, username, None
+
+    except Exception as e:
+        error_msg = str(e)
+        if "401" in error_msg or "unauthorized" in error_msg.lower():
+            return False, None, "Invalid token - unauthorized access"
+        elif "403" in error_msg:
+            return False, None, "Token lacks required permissions"
+        elif "network" in error_msg.lower() or "connection" in error_msg.lower():
+            return False, None, f"Network error: {error_msg}"
+        else:
+            return False, None, f"Validation error: {error_msg}"
+
+
+def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> str:
+    """
+    Generate a default space name with username and timestamp.
+
+    Args:
+        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
+
+    Returns:
+        str: Default space name in format "username/project-type-timestamp"
+    """
+    try:
+        # Get token from environment variables only
+        import os
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
+            print("Set HF_TOKEN environment variable to enable automatic space creation.")
+            print("Example: export HF_TOKEN=your_token_here")
+            print("Falling back to local-only mode.")
+            return None
+
+        # Validate token and get username
+        success, username, error = validate_hf_token(token)
+        if success and username:
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            return f"{username}/{project_type}-{timestamp}"
+        else:
+            print(f"Warning: Token validation failed: {error}")
+            print("Falling back to local-only mode.")
+            return None
+
+    except Exception as e:
+        print(f"Warning: Failed to generate default space name: {e}")
+        return None


 class VoxtralDataCollator:
@@ -161,6 +263,12 @@ def main():
     parser.add_argument("--epochs", type=float, default=3)
     parser.add_argument("--logging-steps", type=int, default=10)
    parser.add_argument("--save-steps", type=int, default=50)
+    parser.add_argument("--trackio-space", type=str, default=None,
+                        help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
+    parser.add_argument("--push-dataset", action="store_true",
+                        help="Push the training dataset to Hugging Face Hub after training")
+    parser.add_argument("--dataset-repo", type=str, default=None,
+                        help="Dataset repository name for pushing dataset (format: username/dataset-name)")
     args = parser.parse_args()

     model_checkpoint = args.model_checkpoint
@@ -169,6 +277,48 @@ def main():
     torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {torch_device}")

+    # Determine trackio space
+    trackio_space = args.trackio_space
+    if not trackio_space:
+        trackio_space = get_default_space_name("voxtral-asr-finetuning")
+
+    # Initialize trackio for experiment tracking
+    if trackio_space:
+        print(f"Initializing trackio with space: {trackio_space}")
+        trackio.init(
+            project="voxtral-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+            },
+            space_id=trackio_space
+        )
+    else:
+        print("Initializing trackio in local-only mode")
+        trackio.init(
+            project="voxtral-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+            }
+        )
+
     print("Loading processor and model...")
     processor = VoxtralProcessor.from_pretrained(model_checkpoint)
     model = VoxtralForConditionalGeneration.from_pretrained(
@@ -200,7 +350,7 @@ def main():
         save_steps=args.save_steps,
         eval_strategy="steps" if eval_dataset else "no",
         save_strategy="steps",
-        report_to="none",
+        report_to=["trackio"],
         remove_unused_columns=False,
         dataloader_num_workers=1,
     )
@@ -223,6 +373,44 @@ def main():
     if eval_dataset:
         results = trainer.evaluate()
         print(f"Final evaluation results: {results}")
+        # Log final evaluation results
+        trackio.log(results)
+
+    # Push dataset to Hub if requested
+    if args.push_dataset and args.dataset_jsonl:
+        print("Pushing dataset to Hugging Face Hub...")
+        try:
+            from pathlib import Path
+            import subprocess
+
+            dataset_repo = args.dataset_repo
+            if not dataset_repo:
+                # Auto-generate dataset repo name
+                if trackio_space:
+                    username = trackio_space.split('/')[0]
+                    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+                    dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
+                else:
+                    print("Warning: Cannot auto-generate dataset repo name without HF token")
+                    dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+            # Call the push script
+            push_cmd = [
+                "python", str(Path(__file__).parent / "push_to_huggingface.py"),
+                "dataset", args.dataset_jsonl, dataset_repo
+            ]
+
+            result = subprocess.run(push_cmd, capture_output=True, text=True)
+            if result.returncode == 0:
+                print(f"✅ Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
+            else:
+                print(f"❌ Failed to push dataset: {result.stderr}")
+
+        except Exception as e:
+            print(f"❌ Error pushing dataset: {e}")
+
+    # Finish trackio logging
+    trackio.finish()

     print("Training completed successfully!")
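The `Trainer` now streams metrics through `report_to=["trackio"]`, and the script brackets training with an explicit init/log/finish cycle. A minimal local-only sketch of that lifecycle (config values are illustrative):

```python
import trackio

trackio.init(project="voxtral-finetuning", config={"learning_rate": 5e-5})
trackio.log({"eval_loss": 1.23})  # mirrors the final trackio.log(results) above
trackio.finish()
```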
 
scripts/train_lora.py CHANGED
@@ -1,8 +1,32 @@
 #!/usr/bin/env python3
+"""
+Voxtral ASR LoRA Fine-tuning Script with Trackio Integration
+
+This script fine-tunes Voxtral models using LoRA for ASR tasks with automatic experiment tracking
+via Trackio and Hugging Face Spaces.
+
+Features:
+- Automatic username detection from HF_TOKEN environment variable
+- Auto-generated space names with timestamps
+- Local-only mode when no HF_TOKEN is set
+- Comprehensive experiment logging
+- LoRA-specific hyperparameters tracking
+- Optional dataset pushing to Hugging Face Hub
+
+Authentication:
+    Set HF_TOKEN environment variable to enable automatic space creation:
+    Linux/Mac: export HF_TOKEN=your_token_here
+    Windows: set HF_TOKEN=your_token_here
+    Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
+
+    Get your token from: https://huggingface.co/settings/tokens
+"""

 import argparse
 import json
 from pathlib import Path
+from datetime import datetime
+from typing import Tuple, Optional
 import torch
 from datasets import load_dataset, Audio, Dataset
 from transformers import (
@@ -12,6 +36,85 @@ from transformers import (
     TrainingArguments,
 )
 from peft import LoraConfig, get_peft_model
+from huggingface_hub import HfApi
+import trackio
+
+
+def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    Validate a Hugging Face token and return the username.
+
+    Args:
+        token (str): The Hugging Face token to validate
+
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+        - success: True if token is valid, False otherwise
+        - username: The username associated with the token (if valid)
+        - error_message: Error message if validation failed
+    """
+    try:
+        # Create API client with token directly
+        api = HfApi(token=token)
+
+        # Try to get user info - this will fail if token is invalid
+        user_info = api.whoami()
+
+        # Extract username from user info
+        username = user_info.get("name", user_info.get("username"))
+
+        if not username:
+            return False, None, "Could not retrieve username from token"
+
+        return True, username, None
+
+    except Exception as e:
+        error_msg = str(e)
+        if "401" in error_msg or "unauthorized" in error_msg.lower():
+            return False, None, "Invalid token - unauthorized access"
+        elif "403" in error_msg:
+            return False, None, "Token lacks required permissions"
+        elif "network" in error_msg.lower() or "connection" in error_msg.lower():
+            return False, None, f"Network error: {error_msg}"
+        else:
+            return False, None, f"Validation error: {error_msg}"
+
+
+def get_default_space_name(project_type: str = "voxtral-lora-finetuning") -> str:
+    """
+    Generate a default space name with username and timestamp.
+
+    Args:
+        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
+
+    Returns:
+        str: Default space name in format "username/project-type-timestamp"
+    """
+    try:
+        # Get token from environment variables only
+        import os
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
+            print("Set HF_TOKEN environment variable to enable automatic space creation.")
+            print("Example: export HF_TOKEN=your_token_here")
+            print("Falling back to local-only mode.")
+            return None
+
+        # Validate token and get username
+        success, username, error = validate_hf_token(token)
+        if success and username:
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            return f"{username}/{project_type}-{timestamp}"
+        else:
+            print(f"Warning: Token validation failed: {error}")
+            print("Falling back to local-only mode.")
+            return None
+
+    except Exception as e:
+        print(f"Warning: Failed to generate default space name: {e}")
+        return None


 class VoxtralDataCollator:
@@ -163,6 +266,12 @@ def main():
     parser.add_argument("--lora-alpha", type=int, default=32)
     parser.add_argument("--lora-dropout", type=float, default=0.0)
     parser.add_argument("--freeze-audio-tower", action="store_true", help="Freeze audio encoder parameters")
+    parser.add_argument("--trackio-space", type=str, default=None,
+                        help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
+    parser.add_argument("--push-dataset", action="store_true",
+                        help="Push the training dataset to Hugging Face Hub after training")
+    parser.add_argument("--dataset-repo", type=str, default=None,
+                        help="Dataset repository name for pushing dataset (format: username/dataset-name)")
     args = parser.parse_args()

     model_checkpoint = args.model_checkpoint
@@ -171,6 +280,56 @@ def main():
     torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {torch_device}")

+    # Determine trackio space
+    trackio_space = args.trackio_space
+    if not trackio_space:
+        trackio_space = get_default_space_name("voxtral-lora-finetuning")
+
+    # Initialize trackio for experiment tracking
+    if trackio_space:
+        print(f"Initializing trackio with space: {trackio_space}")
+        trackio.init(
+            project="voxtral-lora-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+                "lora_r": args.lora_r,
+                "lora_alpha": args.lora_alpha,
+                "lora_dropout": args.lora_dropout,
+                "freeze_audio_tower": args.freeze_audio_tower,
+            },
+            space_id=trackio_space
+        )
+    else:
+        print("Initializing trackio in local-only mode")
+        trackio.init(
+            project="voxtral-lora-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+                "lora_r": args.lora_r,
+                "lora_alpha": args.lora_alpha,
+                "lora_dropout": args.lora_dropout,
+                "freeze_audio_tower": args.freeze_audio_tower,
+            }
+        )
+
     print("Loading processor and model...")
     processor = VoxtralProcessor.from_pretrained(model_checkpoint)
     lora_cfg = LoraConfig(
@@ -210,12 +369,12 @@ def main():
         learning_rate=args.learning_rate,
         num_train_epochs=args.epochs,
         bf16=True,
-        logging_steps=args.logging_issues if hasattr(args, 'logging_issues') else args.logging_steps,
+        logging_steps=args.logging_steps,
         eval_steps=args.save_steps if eval_dataset else None,
         save_steps=args.save_steps,
         eval_strategy="steps" if eval_dataset else "no",
         save_strategy="steps",
-        report_to="none",
+        report_to=["trackio"],
         remove_unused_columns=False,
         dataloader_num_workers=1,
     )
@@ -238,6 +397,44 @@ def main():
     if eval_dataset:
         results = trainer.evaluate()
         print(f"Final evaluation results: {results}")
+        # Log final evaluation results
+        trackio.log(results)
+
+    # Push dataset to Hub if requested
+    if args.push_dataset and args.dataset_jsonl:
+        print("Pushing dataset to Hugging Face Hub...")
+        try:
+            from pathlib import Path
+            import subprocess
+
+            dataset_repo = args.dataset_repo
+            if not dataset_repo:
+                # Auto-generate dataset repo name
+                if trackio_space:
+                    username = trackio_space.split('/')[0]
+                    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+                    dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
+                else:
+                    print("Warning: Cannot auto-generate dataset repo name without HF token")
+                    dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+            # Call the push script
+            push_cmd = [
+                "python", str(Path(__file__).parent / "push_to_huggingface.py"),
+                "dataset", args.dataset_jsonl, dataset_repo
+            ]
+
+            result = subprocess.run(push_cmd, capture_output=True, text=True)
+            if result.returncode == 0:
+                print(f"✅ Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
+            else:
+                print(f"❌ Failed to push dataset: {result.stderr}")
+
+        except Exception as e:
+            print(f"❌ Error pushing dataset: {e}")
+
+    # Finish trackio logging
+    trackio.finish()

     print("Training completed successfully!")
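For reference, the LoRA variant wraps the base model with peft before training; the diff shows the CLI defaults but not the `LoraConfig(...)` arguments themselves, so this sketch fills them in with assumed values:

```python
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=8,                                  # assumed; the script exposes --lora-r
    lora_alpha=32,                        # matches the --lora-alpha default above
    lora_dropout=0.0,                     # matches the --lora-dropout default above
    target_modules=["q_proj", "v_proj"],  # assumed attention projections
)
model = get_peft_model(model, lora_cfg)   # `model` is the loaded Voxtral model
model.print_trainable_parameters()
```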
 
templates/datasets/readme.md DELETED
@@ -1,171 +0,0 @@
----
-dataset_info:
-  features:
-  - name: experiment_id
-    dtype: string
-  - name: name
-    dtype: string
-  - name: description
-    dtype: string
-  - name: created_at
-    dtype: string
-  - name: status
-    dtype: string
-  - name: metrics
-    dtype: string
-  - name: parameters
-    dtype: string
-  - name: artifacts
-    dtype: string
-  - name: logs
-    dtype: string
-  - name: last_updated
-    dtype: string
-  splits:
-  - name: train
-    num_bytes: 4945
-    num_examples: 2
-  download_size: 15529
-  dataset_size: 4945
-configs:
-- config_name: default
-  data_files:
-  - split: train
-    path: data/train-*
-tags:
-- track tonic
-- tonic
-- experiment tracking
-- smollm3
-- fine-tuning
-- legml
-- hermes
----
-
-# Trackio Experiments Dataset
-
-This dataset stores experiment tracking data for ML training runs, particularly focused on SmolLM3 fine-tuning experiments with comprehensive metrics tracking.
-
-## Dataset Structure
-
-The dataset contains the following columns:
-
-- **experiment_id**: Unique identifier for each experiment
-- **name**: Human-readable name for the experiment
-- **description**: Detailed description of the experiment
-- **created_at**: Timestamp when the experiment was created
-- **status**: Current status (running, completed, failed, paused)
-- **metrics**: JSON string containing training metrics over time
-- **parameters**: JSON string containing experiment configuration
-- **artifacts**: JSON string containing experiment artifacts
-- **logs**: JSON string containing experiment logs
-- **last_updated**: Timestamp of last update
-
-## Metrics Structure
-
-The metrics field contains JSON arrays with the following structure:
-
-```json
-[
-  {
-    "timestamp": "2025-07-20T11:20:01.780908",
-    "step": 25,
-    "metrics": {
-      "loss": 1.1659,
-      "accuracy": 0.759,
-      "learning_rate": 7e-08,
-      "grad_norm": 10.3125,
-      "epoch": 0.004851130919895701,
-
-      // Advanced Training Metrics
-      "total_tokens": 1642080.0,
-      "truncated_tokens": 128,
-      "padding_tokens": 256,
-      "throughput": 3284160.0,
-      "step_time": 0.5,
-      "batch_size": 8,
-      "seq_len": 2048,
-      "token_acc": 0.759,
-
-      // Custom Losses
-      "train/gate_ortho": 0.0234,
-      "train/center": 0.0156,
-
-      // System Metrics
-      "gpu_memory_allocated": 17.202261447906494,
-      "gpu_memory_reserved": 75.474609375,
-      "gpu_utilization": 85.2,
-      "cpu_percent": 2.7,
-      "memory_percent": 10.1
-    }
-  }
-]
-```
-
-## Supported Metrics
-
-### Core Training Metrics
-- **loss**: Training loss value
-- **accuracy**: Model accuracy
-- **learning_rate**: Current learning rate
-- **grad_norm**: Gradient norm
-- **epoch**: Current epoch progress
-
-### Advanced Token Metrics
-- **total_tokens**: Total tokens processed in the batch
-- **truncated_tokens**: Number of tokens truncated during processing
-- **padding_tokens**: Number of padding tokens added
-- **throughput**: Tokens processed per second
-- **step_time**: Time taken for the current training step
-- **batch_size**: Current batch size
-- **seq_len**: Sequence length
-- **token_acc**: Token-level accuracy
-
-### Custom Losses (SmolLM3-specific)
-- **train/gate_ortho**: Gate orthogonality loss
-- **train/center**: Center loss component
-
-### System Performance Metrics
-- **gpu_memory_allocated**: GPU memory currently allocated (GB)
-- **gpu_memory_reserved**: GPU memory reserved (GB)
-- **gpu_utilization**: GPU utilization percentage
-- **cpu_percent**: CPU usage percentage
-- **memory_percent**: System memory usage percentage
-
-## Usage
-
-This dataset is automatically used by the Trackio monitoring system to store and retrieve experiment data. It provides persistent storage for experiment tracking across different training runs.
-
-## Integration
-
-The dataset is used by:
-- Trackio Spaces for experiment visualization
-- Training scripts for logging metrics and parameters
-- Monitoring systems for experiment tracking
-- SmolLM3 fine-tuning pipeline for comprehensive metrics capture
-
-## Privacy
-
-This dataset is private by default to ensure experiment data security. Only users with appropriate permissions can access the data.
-
-## Examples
-
-### Sample Experiment Entry
-```json
-{
-  "experiment_id": "exp_20250720_130853",
-  "name": "smollm3_finetune",
-  "description": "SmolLM3 fine-tuning experiment with comprehensive metrics",
-  "created_at": "2025-07-20T11:20:01.780908",
-  "status": "running",
-  "metrics": "[{\"timestamp\": \"2025-07-20T11:20:01.780908\", \"step\": 25, \"metrics\": {\"loss\": 1.1659, \"accuracy\": 0.759, \"total_tokens\": 1642080.0, \"throughput\": 3284160.0, \"train/gate_ortho\": 0.0234, \"train/center\": 0.0156}}]",
-  "parameters": "{\"model_name\": \"HuggingFaceTB/SmolLM3-3B\", \"batch_size\": 8, \"learning_rate\": 3.5e-06, \"max_seq_length\": 12288}",
-  "artifacts": "[]",
-  "logs": "[]",
-  "last_updated": "2025-07-20T11:20:01.780908"
-}
-```
-
-## License
-
-This dataset is part of the Trackio experiment tracking system and follows the same license as the main project.
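Since this README documented `metrics` and `parameters` as JSON-encoded strings, consumers had to decode them per row. A minimal sketch of that decoding, using values drawn from the README's own sample entry:

```python
import json

row = {
    "metrics": '[{"step": 25, "metrics": {"loss": 1.1659, "accuracy": 0.759}}]',
    "parameters": '{"model_name": "HuggingFaceTB/SmolLM3-3B", "batch_size": 8}',
}
for entry in json.loads(row["metrics"]):
    print(entry["step"], entry["metrics"]["loss"])
params = json.loads(row["parameters"])
```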
tests/test_hf_setup.py ADDED
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Test Hugging Face Setup for Trackio Integration
+
+This script helps verify your Hugging Face token setup and test space name generation.
+Run this before using the training scripts to ensure everything is configured correctly.
+
+Authentication:
+    This script only checks for HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variables.
+    It does NOT use huggingface-cli login state.
+
+Setup:
+    Linux/Mac: export HF_TOKEN=your_token_here
+    Windows: set HF_TOKEN=your_token_here
+    Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
+
+    Get your token from: https://huggingface.co/settings/tokens
+"""
+
+import os
+from datetime import datetime
+from typing import Tuple, Optional
+from huggingface_hub import HfApi
+
+
+def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    Validate a Hugging Face token and return the username.
+
+    Args:
+        token (str): The Hugging Face token to validate
+
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+        - success: True if token is valid, False otherwise
+        - username: The username associated with the token (if valid)
+        - error_message: Error message if validation failed
+    """
+    try:
+        # Create API client with token directly
+        api = HfApi(token=token)
+
+        # Try to get user info - this will fail if token is invalid
+        user_info = api.whoami()
+
+        # Extract username from user info
+        username = user_info.get("name", user_info.get("username"))
+
+        if not username:
+            return False, None, "Could not retrieve username from token"
+
+        return True, username, None
+
+    except Exception as e:
+        error_msg = str(e)
+        if "401" in error_msg or "unauthorized" in error_msg.lower():
+            return False, None, "Invalid token - unauthorized access"
+        elif "403" in error_msg:
+            return False, None, "Token lacks required permissions"
+        elif "network" in error_msg.lower() or "connection" in error_msg.lower():
+            return False, None, f"Network error: {error_msg}"
+        else:
+            return False, None, f"Validation error: {error_msg}"
+
+
+def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> str:
+    """
+    Generate a default space name with username and timestamp.
+
+    Args:
+        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
+
+    Returns:
+        str: Default space name in format "username/project-type-timestamp"
+    """
+    try:
+        # Get token from environment variables only
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            return None
+
+        # Validate token and get username
+        success, username, error = validate_hf_token(token)
+        if success and username:
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            return f"{username}/{project_type}-{timestamp}"
+        else:
+            return None
+
+    except Exception as e:
+        print(f"Failed to generate default space name: {e}")
+        return None
+
+
+def main():
+    print("🔍 Testing Hugging Face Setup for Trackio Integration")
+    print("=" * 60)
+
+    # Check for tokens
+    print("\n1. Checking for Hugging Face tokens...")
+
+    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+    if token:
+        print(f"✅ Found token in environment: {token[:10]}...")
+    else:
+        print("❌ No token found in environment variables")
+        print("\n❌ No Hugging Face token found!")
+        print("Please set the HF_TOKEN environment variable:")
+        print("  Linux/Mac: export HF_TOKEN=your_token_here")
+        print("  Windows: set HF_TOKEN=your_token_here")
+        print("  Or: set HUGGINGFACE_HUB_TOKEN=your_token_here")
+        print("\nGet your token from: https://huggingface.co/settings/tokens")
+        return
+
+    # Validate token
+    print("\n2. Validating token...")
+    success, username, error = validate_hf_token(token)
+
+    if success:
+        print(f"✅ Token is valid! Username: {username}")
+    else:
+        print(f"❌ Token validation failed: {error}")
+        return
+
+    # Generate space names
+    print("\n3. Generating default space names...")
+
+    full_finetune_space = get_default_space_name("voxtral-asr-finetuning")
+    lora_finetune_space = get_default_space_name("voxtral-lora-finetuning")
+
+    print(f"📝 Full fine-tuning space: {full_finetune_space}")
+    print(f"📝 LoRA fine-tuning space: {lora_finetune_space}")
+
+    print("\n✅ Setup complete! You can now run training scripts.")
+    print("   They will automatically use the generated space names.")
+    print("\n💡 To override the auto-generated names, use --trackio-space yourname/custom-space")
+
+
+if __name__ == "__main__":
+    main()