Spaces:
Running
Joseph Pollack
committed on
adds automatic authentication, dataset readme, push to hub automation, demo, readme, and interface improvements
- interface.py +120 -3
- requirements.txt +3 -1
- scripts/push_to_huggingface.py +223 -175
- scripts/train.py +189 -1
- scripts/train_lora.py +199 -2
- templates/datasets/readme.md +0 -171
- tests/test_hf_setup.py +141 -0
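For context, the reworked push script (diffed below) now exposes `model` and `dataset` subcommands; a typical invocation, with illustrative paths, looks like:

```bash
# Assumes HF_TOKEN is set; the username is auto-detected from the token,
# so a bare repo name expands to username/repo.
export HF_TOKEN=your_token_here
python scripts/push_to_huggingface.py model /path/to/trained-model my-model-repo
python scripts/push_to_huggingface.py dataset datasets/voxtral_user/data.jsonl my-dataset-repo
```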
interface.py
CHANGED
|
@@ -155,6 +155,104 @@ def _save_uploaded_dataset(files: list, transcripts: list[str]) -> str:
|
|
| 155 |
return str(jsonl_path)
|
| 156 |
|
| 157 |
|
| 158 |
def _save_recordings(recordings: list[tuple[int, list]], transcripts: list[str]) -> str:
|
| 159 |
import soundfile as sf
|
| 160 |
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
|
@@ -231,6 +329,7 @@ def start_voxtral_training(
|
|
| 231 |
repo_name = f"{username}/{repo_short}" if username else repo_short
|
| 232 |
push_args = [
|
| 233 |
str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
|
|
|
|
| 234 |
str(output_dir),
|
| 235 |
repo_name,
|
| 236 |
]
|
|
@@ -519,6 +618,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
| 519 |
gr.update(visible=True), # dataset_status
|
| 520 |
gr.update(visible=True), # advanced_accordion
|
| 521 |
gr.update(visible=True), # save_rec_btn
|
|
|
|
| 522 |
gr.update(visible=True), # start_btn
|
| 523 |
gr.update(visible=True), # logs_box
|
| 524 |
]
|
|
@@ -607,17 +707,27 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
| 607 |
gr.Markdown("### Upload audio + transcripts (optional)")
|
| 608 |
upload_audio = gr.File(file_count="multiple", type="filepath", label="Upload WAV/FLAC files (optional)")
|
| 609 |
transcripts_box = gr.Textbox(lines=6, label="Transcripts (one per line, aligned with files)")
|
| 610 |
save_upload_btn = gr.Button("Save uploaded dataset")
|
|
|
|
| 611 |
|
| 612 |
def _collect_upload(files, txt):
|
| 613 |
lines = [s.strip() for s in (txt or "").splitlines() if s.strip()]
|
| 614 |
-
|
|
|
|
| 615 |
|
| 616 |
-
|
| 617 |
-
|
| 618 |
|
| 619 |
# Save recordings button
|
| 620 |
save_rec_btn = gr.Button("Save recordings as dataset", visible=False)
|
|
|
|
| 621 |
|
| 622 |
def _collect_preloaded_recs(*recs_and_texts):
|
| 623 |
import soundfile as sf
|
|
@@ -646,6 +756,13 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
| 646 |
|
| 647 |
save_rec_btn.click(_collect_preloaded_recs, rec_components + [phrase_texts_state], [jsonl_path_state])
|
| 648 |
|
| 649 |
# Removed multilingual dataset sample section - phrases are now loaded automatically when language is selected
|
| 650 |
|
| 651 |
start_btn = gr.Button("Start Fine-tuning", visible=False)
|
|
|
|
| 155 |
return str(jsonl_path)
|
| 156 |
|
| 157 |
|
| 158 |
+
def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") -> str:
|
| 159 |
+
"""Push dataset to Hugging Face Hub"""
|
| 160 |
+
try:
|
| 161 |
+
from huggingface_hub import HfApi, create_repo
|
| 162 |
+
import json
|
| 163 |
+
from pathlib import Path
|
| 164 |
+
|
| 165 |
+
token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
| 166 |
+
|
| 167 |
+
if not token:
|
| 168 |
+
return "β No HF_TOKEN found. Set HF_TOKEN environment variable to push datasets."
|
| 169 |
+
|
| 170 |
+
api = HfApi(token=token)
|
| 171 |
+
|
| 172 |
+
# Determine full repo name
|
| 173 |
+
if "/" not in repo_name:
|
| 174 |
+
if not username:
|
| 175 |
+
user_info = api.whoami()
|
| 176 |
+
username = user_info.get("name") or user_info.get("username") or ""
|
| 177 |
+
if username:
|
| 178 |
+
repo_name = f"{username}/{repo_name}"
|
| 179 |
+
|
| 180 |
+
# Create dataset repository
|
| 181 |
+
try:
|
| 182 |
+
create_repo(repo_name, repo_type="dataset", token=token, exist_ok=True)
|
| 183 |
+
except Exception as e:
|
| 184 |
+
if "already exists" not in str(e).lower():
|
| 185 |
+
return f"β Failed to create dataset repo: {e}"
|
| 186 |
+
|
| 187 |
+
# Read the JSONL file
|
| 188 |
+
jsonl_file = Path(jsonl_path)
|
| 189 |
+
if not jsonl_file.exists():
|
| 190 |
+
return f"β Dataset file not found: {jsonl_path}"
|
| 191 |
+
|
| 192 |
+
# Upload the JSONL file
|
| 193 |
+
api.upload_file(
|
| 194 |
+
path_or_fileobj=str(jsonl_file),
|
| 195 |
+
path_in_repo="data.jsonl",
|
| 196 |
+
repo_id=repo_name,
|
| 197 |
+
repo_type="dataset",
|
| 198 |
+
token=token
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
# Create a simple README for the dataset
|
| 202 |
+
readme_content = f"""---
|
| 203 |
+
dataset_info:
|
| 204 |
+
features:
|
| 205 |
+
- name: audio_path
|
| 206 |
+
dtype: string
|
| 207 |
+
- name: text
|
| 208 |
+
dtype: string
|
| 209 |
+
splits:
|
| 210 |
+
- name: train
|
| 211 |
+
num_bytes: {jsonl_file.stat().st_size}
|
| 212 |
+
num_examples: {sum(1 for _ in open(jsonl_file))}
|
| 213 |
+
download_size: {jsonl_file.stat().st_size}
|
| 214 |
+
dataset_size: {jsonl_file.stat().st_size}
|
| 215 |
+
---
|
| 216 |
+
|
| 217 |
+
# Voxtral ASR Dataset
|
| 218 |
+
|
| 219 |
+
This dataset was created using the Voxtral ASR Fine-tuning Interface.
|
| 220 |
+
|
| 221 |
+
## Dataset Structure
|
| 222 |
+
|
| 223 |
+
- **audio_path**: Path to the audio file
|
| 224 |
+
- **text**: Transcription of the audio
|
| 225 |
+
|
| 226 |
+
## Usage
|
| 227 |
+
|
| 228 |
+
```python
|
| 229 |
+
from datasets import load_dataset
|
| 230 |
+
|
| 231 |
+
dataset = load_dataset("{repo_name}")
|
| 232 |
+
```
|
| 233 |
+
"""
|
| 234 |
+
|
| 235 |
+
# Upload README
|
| 236 |
+
readme_path = jsonl_file.parent / "README.md"
|
| 237 |
+
with open(readme_path, "w") as f:
|
| 238 |
+
f.write(readme_content)
|
| 239 |
+
|
| 240 |
+
api.upload_file(
|
| 241 |
+
path_or_fileobj=str(readme_path),
|
| 242 |
+
path_in_repo="README.md",
|
| 243 |
+
repo_id=repo_name,
|
| 244 |
+
repo_type="dataset",
|
| 245 |
+
token=token
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
readme_path.unlink() # Clean up temp file
|
| 249 |
+
|
| 250 |
+
return f"β
Dataset pushed to: https://huggingface.co/datasets/{repo_name}"
|
| 251 |
+
|
| 252 |
+
except Exception as e:
|
| 253 |
+
return f"β Failed to push dataset: {e}"
|
| 254 |
+
|
| 255 |
+
|
| 256 |
def _save_recordings(recordings: list[tuple[int, list]], transcripts: list[str]) -> str:
|
| 257 |
import soundfile as sf
|
| 258 |
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
|
|
|
| 329 |
repo_name = f"{username}/{repo_short}" if username else repo_short
|
| 330 |
push_args = [
|
| 331 |
str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
|
| 332 |
+
"model",
|
| 333 |
str(output_dir),
|
| 334 |
repo_name,
|
| 335 |
]
|
|
|
|
| 618 |
gr.update(visible=True), # dataset_status
|
| 619 |
gr.update(visible=True), # advanced_accordion
|
| 620 |
gr.update(visible=True), # save_rec_btn
|
| 621 |
+
gr.update(visible=True), # push_recordings_btn
|
| 622 |
gr.update(visible=True), # start_btn
|
| 623 |
gr.update(visible=True), # logs_box
|
| 624 |
]
|
|
|
|
| 707 |
gr.Markdown("### Upload audio + transcripts (optional)")
|
| 708 |
upload_audio = gr.File(file_count="multiple", type="filepath", label="Upload WAV/FLAC files (optional)")
|
| 709 |
transcripts_box = gr.Textbox(lines=6, label="Transcripts (one per line, aligned with files)")
|
| 710 |
+
dataset_repo_name = gr.Textbox(value=f"voxtral-dataset-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
| 711 |
+
label="Dataset repo name (will be pushed to HF Hub)")
|
| 712 |
save_upload_btn = gr.Button("Save uploaded dataset")
|
| 713 |
+
push_dataset_btn = gr.Button("Push dataset to HF Hub")
|
| 714 |
|
| 715 |
def _collect_upload(files, txt):
|
| 716 |
lines = [s.strip() for s in (txt or "").splitlines() if s.strip()]
|
| 717 |
+
jsonl_path = _save_uploaded_dataset(files or [], lines)
|
| 718 |
+
return f"β
Dataset saved locally: {jsonl_path}"
|
| 719 |
|
| 720 |
+
def _push_dataset_handler(repo_name):
|
| 721 |
+
if not jsonl_path_state.value:
|
| 722 |
+
return "β No dataset saved yet. Please save dataset first."
|
| 723 |
+
return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
|
| 724 |
+
|
| 725 |
+
save_upload_btn.click(_collect_upload, [upload_audio, transcripts_box], [jsonl_path_state])
|
| 726 |
+
push_dataset_btn.click(_push_dataset_handler, [dataset_repo_name], [jsonl_path_state])
|
| 727 |
|
| 728 |
# Save recordings button
|
| 729 |
save_rec_btn = gr.Button("Save recordings as dataset", visible=False)
|
| 730 |
+
push_recordings_btn = gr.Button("Push recordings dataset to HF Hub", visible=False)
|
| 731 |
|
| 732 |
def _collect_preloaded_recs(*recs_and_texts):
|
| 733 |
import soundfile as sf
|
|
|
|
| 756 |
|
| 757 |
save_rec_btn.click(_collect_preloaded_recs, rec_components + [phrase_texts_state], [jsonl_path_state])
|
| 758 |
|
| 759 |
+
def _push_recordings_handler(repo_name):
|
| 760 |
+
if not jsonl_path_state.value:
|
| 761 |
+
return "β No recordings dataset saved yet. Please save recordings first."
|
| 762 |
+
return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
|
| 763 |
+
|
| 764 |
+
push_recordings_btn.click(_push_recordings_handler, [dataset_repo_name], [jsonl_path_state])
|
| 765 |
+
|
| 766 |
# Removed multilingual dataset sample section - phrases are now loaded automatically when language is selected
|
| 767 |
|
| 768 |
start_btn = gr.Button("Start Fine-tuning", visible=False)
|
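Note that the new push helpers in interface.py resolve credentials from several environment variables (HF_TOKEN, HF_WRITE_TOKEN, or HUGGINGFACE_HUB_TOKEN), so pushing from the UI only requires exporting one of them before launch; a minimal sketch, assuming the Gradio app is started directly from the repo root:

```bash
export HF_TOKEN=your_token_here  # read by _push_dataset_to_hub at push time
python interface.py
```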
requirements.txt
CHANGED
|
@@ -2,4 +2,6 @@ torch
|
|
| 2 |
datasets
|
| 3 |
peft
|
| 4 |
transformers
|
| 5 |
-
gradio
|
| 2 |
datasets
|
| 3 |
peft
|
| 4 |
transformers
|
| 5 |
+
gradio
|
| 6 |
+
trackio
|
| 7 |
+
huggingface_hub
|
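The two added dependencies back the new features: trackio provides the experiment logging wired into the training scripts below, and huggingface_hub provides the authentication and upload APIs. A normal install picks them up:

```bash
pip install -r requirements.txt
```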
scripts/push_to_huggingface.py
CHANGED
|
@@ -1,20 +1,26 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Push Trained
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
| 8 |
import json
|
| 9 |
import argparse
|
| 10 |
import logging
|
| 11 |
-
import time
|
| 12 |
from pathlib import Path
|
| 13 |
-
from typing import Dict, Any, Optional
|
| 14 |
from datetime import datetime
|
| 15 |
-
import subprocess
|
| 16 |
-
import shutil
|
| 17 |
-
import platform
|
| 18 |
|
| 19 |
# Set timeout for HF operations to prevent hanging
|
| 20 |
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
|
|
@@ -22,34 +28,15 @@ os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600'
|
|
| 22 |
|
| 23 |
try:
|
| 24 |
from huggingface_hub import HfApi, create_repo, upload_file
|
| 25 |
-
from huggingface_hub import snapshot_download, hf_hub_download
|
| 26 |
HF_AVAILABLE = True
|
| 27 |
except ImportError:
|
| 28 |
HF_AVAILABLE = False
|
| 29 |
print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")
|
| 30 |
|
| 31 |
-
try:
|
| 32 |
-
import sys
|
| 33 |
-
import os
|
| 34 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
| 35 |
-
from monitoring import SmolLM3Monitor
|
| 36 |
-
MONITORING_AVAILABLE = True
|
| 37 |
-
except ImportError:
|
| 38 |
-
MONITORING_AVAILABLE = False
|
| 39 |
-
print("Warning: monitoring module not available")
|
| 40 |
-
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
-
class TimeoutError(Exception):
|
| 44 |
-
"""Custom timeout exception"""
|
| 45 |
-
pass
|
| 46 |
-
|
| 47 |
-
def timeout_handler(signum, frame):
|
| 48 |
-
"""Signal handler for timeout"""
|
| 49 |
-
raise TimeoutError("Operation timed out")
|
| 50 |
-
|
| 51 |
class HuggingFacePusher:
|
| 52 |
-
"""Push trained models
|
| 53 |
|
| 54 |
def __init__(
|
| 55 |
self,
|
|
@@ -57,44 +44,22 @@ class HuggingFacePusher:
|
|
| 57 |
repo_name: str,
|
| 58 |
token: Optional[str] = None,
|
| 59 |
private: bool = False,
|
| 60 |
-
trackio_url: Optional[str] = None,
|
| 61 |
-
experiment_name: Optional[str] = None,
|
| 62 |
-
dataset_repo: Optional[str] = None,
|
| 63 |
-
hf_token: Optional[str] = None,
|
| 64 |
author_name: Optional[str] = None,
|
| 65 |
model_description: Optional[str] = None,
|
| 66 |
-
training_config_type: Optional[str] = None,
|
| 67 |
model_name: Optional[str] = None,
|
| 68 |
-
dataset_name: Optional[str] = None
|
| 69 |
-
batch_size: Optional[str] = None,
|
| 70 |
-
learning_rate: Optional[str] = None,
|
| 71 |
-
max_epochs: Optional[str] = None,
|
| 72 |
-
max_seq_length: Optional[str] = None,
|
| 73 |
-
trainer_type: Optional[str] = None
|
| 74 |
):
|
| 75 |
self.model_path = Path(model_path)
|
| 76 |
# Original user input (may be just the repo name without username)
|
| 77 |
self.repo_name = repo_name
|
| 78 |
-
self.token = token or
|
| 79 |
self.private = private
|
| 80 |
-
self.trackio_url = trackio_url
|
| 81 |
-
self.experiment_name = experiment_name
|
| 82 |
self.author_name = author_name
|
| 83 |
self.model_description = model_description
|
| 84 |
-
|
| 85 |
-
#
|
| 86 |
-
self.
|
| 87 |
-
self.model_name = model_name
|
| 88 |
self.dataset_name = dataset_name
|
| 89 |
-
self.batch_size = batch_size
|
| 90 |
-
self.learning_rate = learning_rate
|
| 91 |
-
self.max_epochs = max_epochs
|
| 92 |
-
self.max_seq_length = max_seq_length
|
| 93 |
-
self.trainer_type = trainer_type
|
| 94 |
-
|
| 95 |
-
# HF Datasets configuration
|
| 96 |
-
self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
|
| 97 |
-
self.hf_token = hf_token or os.getenv('HF_TOKEN')
|
| 98 |
|
| 99 |
# Initialize HF API
|
| 100 |
if HF_AVAILABLE:
|
|
@@ -105,19 +70,7 @@ class HuggingFacePusher:
|
|
| 105 |
# Resolve the full repo id (username/repo) if user only provided repo name
|
| 106 |
self.repo_id = self._resolve_repo_id(self.repo_name)
|
| 107 |
|
| 108 |
-
# Initialize monitoring if available
|
| 109 |
-
self.monitor = None
|
| 110 |
-
if MONITORING_AVAILABLE:
|
| 111 |
-
self.monitor = SmolLM3Monitor(
|
| 112 |
-
experiment_name=experiment_name or "model_push",
|
| 113 |
-
trackio_url=trackio_url,
|
| 114 |
-
enable_tracking=bool(trackio_url),
|
| 115 |
-
hf_token=self.hf_token,
|
| 116 |
-
dataset_repo=self.dataset_repo
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
|
| 120 |
-
logger.info(f"Dataset repository: {self.dataset_repo}")
|
| 121 |
|
| 122 |
def _resolve_repo_id(self, repo_name: str) -> str:
|
| 123 |
"""Return a fully-qualified repo id in the form username/repo.
|
|
@@ -515,59 +468,33 @@ MIT License
|
|
| 515 |
logger.error(f"β Failed to create README: {e}")
|
| 516 |
return False
|
| 517 |
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
if self.monitor:
|
| 521 |
-
try:
|
| 522 |
-
# Log to Trackio
|
| 523 |
-
self.monitor.log_metrics({
|
| 524 |
-
"push_action": action,
|
| 525 |
-
"repo_name": self.repo_id,
|
| 526 |
-
"model_size_gb": self._get_model_size(),
|
| 527 |
-
"dataset_repo": self.dataset_repo,
|
| 528 |
-
**details
|
| 529 |
-
})
|
| 530 |
-
|
| 531 |
-
# Log training summary
|
| 532 |
-
self.monitor.log_training_summary({
|
| 533 |
-
"model_push": True,
|
| 534 |
-
"model_repo": self.repo_id,
|
| 535 |
-
"dataset_repo": self.dataset_repo,
|
| 536 |
-
"push_date": datetime.now().isoformat(),
|
| 537 |
-
**details
|
| 538 |
-
})
|
| 539 |
-
|
| 540 |
-
logger.info(f"β
Logged {action} to Trackio and HF Datasets")
|
| 541 |
-
except Exception as e:
|
| 542 |
-
logger.error(f"β Failed to log to Trackio: {e}")
|
| 543 |
-
|
| 544 |
-
def push_model(self, training_config: Optional[Dict[str, Any]] = None,
|
| 545 |
results: Optional[Dict[str, Any]] = None) -> bool:
|
| 546 |
-
"""Complete model push process
|
| 547 |
logger.info(f"π Starting model push to {self.repo_id}")
|
| 548 |
-
|
| 549 |
-
|
| 550 |
# Validate model path
|
| 551 |
if not self.validate_model_path():
|
| 552 |
return False
|
| 553 |
-
|
| 554 |
# Create repository
|
| 555 |
if not self.create_repository():
|
| 556 |
return False
|
| 557 |
-
|
| 558 |
# Load training config and results if not provided
|
| 559 |
if training_config is None:
|
| 560 |
training_config = self._load_training_config()
|
| 561 |
-
|
| 562 |
if results is None:
|
| 563 |
results = self._load_training_results()
|
| 564 |
-
|
| 565 |
# Create and upload model card
|
| 566 |
model_card = self.create_model_card(training_config, results)
|
| 567 |
model_card_path = Path("temp_model_card.md")
|
| 568 |
with open(model_card_path, "w") as f:
|
| 569 |
f.write(model_card)
|
| 570 |
-
|
| 571 |
try:
|
| 572 |
upload_file(
|
| 573 |
path_or_fileobj=str(model_card_path),
|
|
@@ -577,27 +504,135 @@ MIT License
|
|
| 577 |
)
|
| 578 |
finally:
|
| 579 |
model_card_path.unlink()
|
| 580 |
-
|
| 581 |
# Upload model files
|
| 582 |
if not self.upload_model_files():
|
| 583 |
return False
|
| 584 |
-
|
| 585 |
# Upload training results
|
| 586 |
if results:
|
| 587 |
self.upload_training_results(str(self.model_path))
|
| 588 |
-
|
| 589 |
-
# Log
|
| 590 |
-
|
| 591 |
-
"model_path": str(self.model_path),
|
| 592 |
-
"repo_name": self.repo_name,
|
| 593 |
-
"private": self.private,
|
| 594 |
-
"training_config": training_config,
|
| 595 |
-
"results": results
|
| 596 |
-
})
|
| 597 |
-
|
| 598 |
logger.info(f"π Model successfully pushed to: https://huggingface.co/{self.repo_id}")
|
| 599 |
-
|
| 600 |
return True
|
| 601 |
|
| 602 |
def _load_training_config(self) -> Dict[str, Any]:
|
| 603 |
"""Load training configuration"""
|
|
@@ -619,81 +654,94 @@ def parse_args():
|
|
| 619 |
"""Parse command line arguments"""
|
| 620 |
parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')
|
| 621 |
|
| 622 |
-
#
|
| 623 |
-
parser.
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
|
| 643 |
|
| 644 |
return parser.parse_args()
|
| 645 |
|
| 646 |
def main():
|
| 647 |
"""Main function"""
|
| 648 |
args = parse_args()
|
| 649 |
-
|
| 650 |
# Setup logging
|
| 651 |
logging.basicConfig(
|
| 652 |
level=logging.INFO,
|
| 653 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 654 |
)
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
|
|
|
| 659 |
try:
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
logger.info(
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
|
| 693 |
except Exception as e:
|
| 694 |
-
logger.error(f"β Error during
|
| 695 |
return 1
|
| 696 |
-
|
| 697 |
return 0
|
| 698 |
|
| 699 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Push Trained Models and Datasets to Hugging Face Hub
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
# Push a trained model
|
| 7 |
+
python push_to_huggingface.py model /path/to/model my-model-repo
|
| 8 |
+
|
| 9 |
+
# Push a dataset
|
| 10 |
+
python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo
|
| 11 |
+
|
| 12 |
+
Authentication:
|
| 13 |
+
Set HF_TOKEN environment variable or use --token:
|
| 14 |
+
export HF_TOKEN=your_token_here
|
| 15 |
"""
|
| 16 |
|
| 17 |
import os
|
| 18 |
import json
|
| 19 |
import argparse
|
| 20 |
import logging
|
|
|
|
| 21 |
from pathlib import Path
|
| 22 |
+
from typing import Dict, Any, Optional
|
| 23 |
from datetime import datetime
|
| 24 |
|
| 25 |
# Set timeout for HF operations to prevent hanging
|
| 26 |
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
|
|
|
|
| 28 |
|
| 29 |
try:
|
| 30 |
from huggingface_hub import HfApi, create_repo, upload_file
|
|
|
|
| 31 |
HF_AVAILABLE = True
|
| 32 |
except ImportError:
|
| 33 |
HF_AVAILABLE = False
|
| 34 |
print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")
|
| 35 |
|
| 36 |
logger = logging.getLogger(__name__)
|
| 37 |
|
| 38 |
class HuggingFacePusher:
|
| 39 |
+
"""Push trained models to Hugging Face Hub"""
|
| 40 |
|
| 41 |
def __init__(
|
| 42 |
self,
|
|
|
|
| 44 |
repo_name: str,
|
| 45 |
token: Optional[str] = None,
|
| 46 |
private: bool = False,
|
|
| 47 |
author_name: Optional[str] = None,
|
| 48 |
model_description: Optional[str] = None,
|
|
|
|
| 49 |
model_name: Optional[str] = None,
|
| 50 |
+
dataset_name: Optional[str] = None
|
| 51 |
):
|
| 52 |
self.model_path = Path(model_path)
|
| 53 |
# Original user input (may be just the repo name without username)
|
| 54 |
self.repo_name = repo_name
|
| 55 |
+
self.token = token or os.getenv('HF_TOKEN')
|
| 56 |
self.private = private
|
| 57 |
self.author_name = author_name
|
| 58 |
self.model_description = model_description
|
| 59 |
+
|
| 60 |
+
# Model card generation details
|
| 61 |
+
self.model_name = model_name
|
|
|
|
| 62 |
self.dataset_name = dataset_name
|
| 63 |
|
| 64 |
# Initialize HF API
|
| 65 |
if HF_AVAILABLE:
|
|
|
|
| 70 |
# Resolve the full repo id (username/repo) if user only provided repo name
|
| 71 |
self.repo_id = self._resolve_repo_id(self.repo_name)
|
| 72 |
|
| 73 |
logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
|
|
|
|
| 74 |
|
| 75 |
def _resolve_repo_id(self, repo_name: str) -> str:
|
| 76 |
"""Return a fully-qualified repo id in the form username/repo.
|
|
|
|
| 468 |
logger.error(f"β Failed to create README: {e}")
|
| 469 |
return False
|
| 470 |
|
| 471 |
+
|
| 472 |
+
def push_model(self, training_config: Optional[Dict[str, Any]] = None,
|
| 473 |
results: Optional[Dict[str, Any]] = None) -> bool:
|
| 474 |
+
"""Complete model push process"""
|
| 475 |
logger.info(f"π Starting model push to {self.repo_id}")
|
| 476 |
+
|
|
|
|
| 477 |
# Validate model path
|
| 478 |
if not self.validate_model_path():
|
| 479 |
return False
|
| 480 |
+
|
| 481 |
# Create repository
|
| 482 |
if not self.create_repository():
|
| 483 |
return False
|
| 484 |
+
|
| 485 |
# Load training config and results if not provided
|
| 486 |
if training_config is None:
|
| 487 |
training_config = self._load_training_config()
|
| 488 |
+
|
| 489 |
if results is None:
|
| 490 |
results = self._load_training_results()
|
| 491 |
+
|
| 492 |
# Create and upload model card
|
| 493 |
model_card = self.create_model_card(training_config, results)
|
| 494 |
model_card_path = Path("temp_model_card.md")
|
| 495 |
with open(model_card_path, "w") as f:
|
| 496 |
f.write(model_card)
|
| 497 |
+
|
| 498 |
try:
|
| 499 |
upload_file(
|
| 500 |
path_or_fileobj=str(model_card_path),
|
|
|
|
| 504 |
)
|
| 505 |
finally:
|
| 506 |
model_card_path.unlink()
|
| 507 |
+
|
| 508 |
# Upload model files
|
| 509 |
if not self.upload_model_files():
|
| 510 |
return False
|
| 511 |
+
|
| 512 |
# Upload training results
|
| 513 |
if results:
|
| 514 |
self.upload_training_results(str(self.model_path))
|
| 515 |
+
|
| 516 |
+
# Log success
|
| 517 |
+
logger.info(f"β
Model successfully pushed to {self.repo_id}")
|
| 518 |
logger.info(f"π Model successfully pushed to: https://huggingface.co/{self.repo_id}")
|
| 519 |
+
|
| 520 |
return True
|
| 521 |
+
|
| 522 |
+
def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
|
| 523 |
+
"""Push dataset to Hugging Face Hub"""
|
| 524 |
+
logger.info(f"π Starting dataset push to {dataset_repo_name}")
|
| 525 |
+
|
| 526 |
+
try:
|
| 527 |
+
from huggingface_hub import create_repo
|
| 528 |
+
import json
|
| 529 |
+
|
| 530 |
+
# Determine full dataset repo name
|
| 531 |
+
if "/" not in dataset_repo_name:
|
| 532 |
+
dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}"
|
| 533 |
+
|
| 534 |
+
# Create dataset repository
|
| 535 |
+
try:
|
| 536 |
+
create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True)
|
| 537 |
+
logger.info(f"β
Created dataset repository: {dataset_repo_name}")
|
| 538 |
+
except Exception as e:
|
| 539 |
+
if "already exists" not in str(e).lower():
|
| 540 |
+
logger.error(f"β Failed to create dataset repo: {e}")
|
| 541 |
+
return False
|
| 542 |
+
logger.info(f"π Dataset repository already exists: {dataset_repo_name}")
|
| 543 |
+
|
| 544 |
+
# Read the dataset file
|
| 545 |
+
dataset_file = Path(dataset_path)
|
| 546 |
+
if not dataset_file.exists():
|
| 547 |
+
logger.error(f"β Dataset file not found: {dataset_path}")
|
| 548 |
+
return False
|
| 549 |
+
|
| 550 |
+
# Count lines for metadata
|
| 551 |
+
with open(dataset_file, 'r', encoding='utf-8') as f:
|
| 552 |
+
num_examples = sum(1 for _ in f)
|
| 553 |
+
|
| 554 |
+
file_size = dataset_file.stat().st_size
|
| 555 |
+
|
| 556 |
+
# Upload the dataset file
|
| 557 |
+
upload_file(
|
| 558 |
+
path_or_fileobj=str(dataset_file),
|
| 559 |
+
path_in_repo="data.jsonl",
|
| 560 |
+
repo_id=dataset_repo_name,
|
| 561 |
+
repo_type="dataset",
|
| 562 |
+
token=self.token
|
| 563 |
+
)
|
| 564 |
+
logger.info(f"β
Uploaded dataset file: {dataset_file.name}")
|
| 565 |
+
|
| 566 |
+
# Create a dataset README
|
| 567 |
+
readme_content = f"""---
|
| 568 |
+
dataset_info:
|
| 569 |
+
features:
|
| 570 |
+
- name: audio_path
|
| 571 |
+
dtype: string
|
| 572 |
+
- name: text
|
| 573 |
+
dtype: string
|
| 574 |
+
splits:
|
| 575 |
+
- name: train
|
| 576 |
+
num_bytes: {file_size}
|
| 577 |
+
num_examples: {num_examples}
|
| 578 |
+
download_size: {file_size}
|
| 579 |
+
dataset_size: {file_size}
|
| 580 |
+
tags:
|
| 581 |
+
- voxtral
|
| 582 |
+
- asr
|
| 583 |
+
- fine-tuning
|
| 584 |
+
- conversational
|
| 585 |
+
- speech-to-text
|
| 586 |
+
- audio-to-text
|
| 587 |
+
- tonic
|
| 588 |
+
---
|
| 589 |
+
|
| 590 |
+
# Voxtral ASR Dataset
|
| 591 |
+
|
| 592 |
+
This dataset was created for fine-tuning Voxtral ASR models.
|
| 593 |
+
|
| 594 |
+
## Dataset Structure
|
| 595 |
+
|
| 596 |
+
- **audio_path**: Path to the audio file
|
| 597 |
+
- **text**: Transcription of the audio
|
| 598 |
+
|
| 599 |
+
## Statistics
|
| 600 |
+
|
| 601 |
+
- Number of examples: {num_examples}
|
| 602 |
+
- File size: {file_size} bytes
|
| 603 |
+
|
| 604 |
+
## Usage
|
| 605 |
+
|
| 606 |
+
```python
|
| 607 |
+
from datasets import load_dataset
|
| 608 |
+
|
| 609 |
+
dataset = load_dataset("{dataset_repo_name}")
|
| 610 |
+
```
|
| 611 |
+
"""
|
| 612 |
+
|
| 613 |
+
# Upload README
|
| 614 |
+
readme_path = dataset_file.parent / "README.md"
|
| 615 |
+
with open(readme_path, "w") as f:
|
| 616 |
+
f.write(readme_content)
|
| 617 |
+
|
| 618 |
+
upload_file(
|
| 619 |
+
path_or_fileobj=str(readme_path),
|
| 620 |
+
path_in_repo="README.md",
|
| 621 |
+
repo_id=dataset_repo_name,
|
| 622 |
+
repo_type="dataset",
|
| 623 |
+
token=self.token
|
| 624 |
+
)
|
| 625 |
+
|
| 626 |
+
readme_path.unlink() # Clean up temp file
|
| 627 |
+
|
| 628 |
+
logger.info(f"β
Dataset README uploaded")
|
| 629 |
+
logger.info(f"π Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
|
| 630 |
+
|
| 631 |
+
return True
|
| 632 |
+
|
| 633 |
+
except Exception as e:
|
| 634 |
+
logger.error(f"β Failed to push dataset: {e}")
|
| 635 |
+
return False
|
| 636 |
|
| 637 |
def _load_training_config(self) -> Dict[str, Any]:
|
| 638 |
"""Load training configuration"""
|
|
|
|
| 654 |
"""Parse command line arguments"""
|
| 655 |
parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')
|
| 656 |
|
| 657 |
+
# Subcommands
|
| 658 |
+
subparsers = parser.add_subparsers(dest='command', help='Available commands')
|
| 659 |
+
|
| 660 |
+
# Model push subcommand
|
| 661 |
+
model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub')
|
| 662 |
+
model_parser.add_argument('model_path', type=str, help='Path to trained model directory')
|
| 663 |
+
model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
|
| 664 |
+
model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
|
| 665 |
+
model_parser.add_argument('--private', action='store_true', help='Make repository private')
|
| 666 |
+
model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
|
| 667 |
+
model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
|
| 668 |
+
model_parser.add_argument('--model-name', type=str, default=None, help='Base model name')
|
| 669 |
+
model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
|
| 670 |
+
|
| 671 |
+
# Dataset push subcommand
|
| 672 |
+
dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub')
|
| 673 |
+
dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file')
|
| 674 |
+
dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
|
| 675 |
+
dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
|
| 676 |
+
dataset_parser.add_argument('--private', action='store_true', help='Make repository private')
|
|
|
|
| 677 |
|
| 678 |
return parser.parse_args()
|
| 679 |
|
| 680 |
def main():
|
| 681 |
"""Main function"""
|
| 682 |
args = parse_args()
|
| 683 |
+
|
| 684 |
# Setup logging
|
| 685 |
logging.basicConfig(
|
| 686 |
level=logging.INFO,
|
| 687 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 688 |
)
|
| 689 |
+
|
| 690 |
+
if not args.command:
|
| 691 |
+
logger.error("β No command specified. Use 'model' or 'dataset' subcommand.")
|
| 692 |
+
return 1
|
| 693 |
+
|
| 694 |
try:
|
| 695 |
+
if args.command == 'model':
|
| 696 |
+
logger.info("Starting model push to Hugging Face Hub")
|
| 697 |
+
|
| 698 |
+
# Initialize pusher
|
| 699 |
+
pusher = HuggingFacePusher(
|
| 700 |
+
model_path=args.model_path,
|
| 701 |
+
repo_name=args.repo_name,
|
| 702 |
+
token=args.token,
|
| 703 |
+
private=args.private,
|
| 704 |
+
author_name=args.author_name,
|
| 705 |
+
model_description=args.model_description,
|
| 706 |
+
model_name=args.model_name,
|
| 707 |
+
dataset_name=args.dataset_name
|
| 708 |
+
)
|
| 709 |
+
|
| 710 |
+
# Push model
|
| 711 |
+
success = pusher.push_model()
|
| 712 |
+
|
| 713 |
+
if success:
|
| 714 |
+
logger.info("β
Model push completed successfully!")
|
| 715 |
+
logger.info(f"π View your model at: https://huggingface.co/{args.repo_name}")
|
| 716 |
+
else:
|
| 717 |
+
logger.error("β Model push failed!")
|
| 718 |
+
return 1
|
| 719 |
+
|
| 720 |
+
elif args.command == 'dataset':
|
| 721 |
+
logger.info("Starting dataset push to Hugging Face Hub")
|
| 722 |
+
|
| 723 |
+
# Initialize pusher for dataset
|
| 724 |
+
pusher = HuggingFacePusher(
|
| 725 |
+
model_path="", # Not needed for dataset push
|
| 726 |
+
repo_name=args.repo_name,
|
| 727 |
+
token=args.token,
|
| 728 |
+
private=args.private
|
| 729 |
+
)
|
| 730 |
+
|
| 731 |
+
# Push dataset
|
| 732 |
+
success = pusher.push_dataset(args.dataset_path, args.repo_name)
|
| 733 |
+
|
| 734 |
+
if success:
|
| 735 |
+
logger.info("β
Dataset push completed successfully!")
|
| 736 |
+
logger.info(f"π View your dataset at: https://huggingface.co/datasets/{args.repo_name}")
|
| 737 |
+
else:
|
| 738 |
+
logger.error("β Dataset push failed!")
|
| 739 |
+
return 1
|
| 740 |
+
|
| 741 |
except Exception as e:
|
| 742 |
+
logger.error(f"β Error during push: {e}")
|
| 743 |
return 1
|
| 744 |
+
|
| 745 |
return 0
|
| 746 |
|
| 747 |
if __name__ == "__main__":
|
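A dataset pushed with the `dataset` subcommand can then be consumed exactly as the generated README describes; a minimal sketch, assuming an illustrative repo named `username/my-dataset-repo`:

```python
from datasets import load_dataset

# The push script uploads a single data.jsonl with audio_path/text columns,
# which the datasets library loads as a train split automatically.
dataset = load_dataset("username/my-dataset-repo")
print(dataset["train"][0])
```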
scripts/train.py
CHANGED
|
@@ -1,8 +1,31 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
|
| 3 |
import argparse
|
| 4 |
import json
|
| 5 |
from pathlib import Path
|
| 6 |
import torch
|
| 7 |
from datasets import load_dataset, Audio, Dataset
|
| 8 |
from transformers import (
|
|
@@ -11,6 +34,85 @@ from transformers import (
|
|
| 11 |
Trainer,
|
| 12 |
TrainingArguments,
|
| 13 |
)
|
| 14 |
|
| 15 |
|
| 16 |
class VoxtralDataCollator:
|
|
@@ -161,6 +263,12 @@ def main():
|
|
| 161 |
parser.add_argument("--epochs", type=float, default=3)
|
| 162 |
parser.add_argument("--logging-steps", type=int, default=10)
|
| 163 |
parser.add_argument("--save-steps", type=int, default=50)
|
| 164 |
args = parser.parse_args()
|
| 165 |
|
| 166 |
model_checkpoint = args.model_checkpoint
|
|
@@ -169,6 +277,48 @@ def main():
|
|
| 169 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 170 |
print(f"Using device: {torch_device}")
|
| 171 |
|
| 172 |
print("Loading processor and model...")
|
| 173 |
processor = VoxtralProcessor.from_pretrained(model_checkpoint)
|
| 174 |
model = VoxtralForConditionalGeneration.from_pretrained(
|
|
@@ -200,7 +350,7 @@ def main():
|
|
| 200 |
save_steps=args.save_steps,
|
| 201 |
eval_strategy="steps" if eval_dataset else "no",
|
| 202 |
save_strategy="steps",
|
| 203 |
-
report_to="
|
| 204 |
remove_unused_columns=False,
|
| 205 |
dataloader_num_workers=1,
|
| 206 |
)
|
|
@@ -223,6 +373,44 @@ def main():
|
|
| 223 |
if eval_dataset:
|
| 224 |
results = trainer.evaluate()
|
| 225 |
print(f"Final evaluation results: {results}")
|
| 226 |
|
| 227 |
print("Training completed successfully!")
|
| 228 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Voxtral ASR Full Fine-tuning Script with Trackio Integration
|
| 4 |
+
|
| 5 |
+
This script fine-tunes Voxtral models for ASR tasks with automatic experiment tracking
|
| 6 |
+
via Trackio and Hugging Face Spaces.
|
| 7 |
+
|
| 8 |
+
Features:
|
| 9 |
+
- Automatic username detection from HF_TOKEN environment variable
|
| 10 |
+
- Auto-generated space names with timestamps
|
| 11 |
+
- Local-only mode when no HF_TOKEN is set
|
| 12 |
+
- Comprehensive experiment logging
|
| 13 |
+
- Optional dataset pushing to Hugging Face Hub
|
| 14 |
+
|
| 15 |
+
Authentication:
|
| 16 |
+
Set HF_TOKEN environment variable to enable automatic space creation:
|
| 17 |
+
Linux/Mac: export HF_TOKEN=your_token_here
|
| 18 |
+
Windows: set HF_TOKEN=your_token_here
|
| 19 |
+
Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
|
| 20 |
+
|
| 21 |
+
Get your token from: https://huggingface.co/settings/tokens
|
| 22 |
+
"""
|
| 23 |
|
| 24 |
import argparse
|
| 25 |
import json
|
| 26 |
from pathlib import Path
|
| 27 |
+
from datetime import datetime
|
| 28 |
+
from typing import Tuple, Optional
|
| 29 |
import torch
|
| 30 |
from datasets import load_dataset, Audio, Dataset
|
| 31 |
from transformers import (
|
|
|
|
| 34 |
Trainer,
|
| 35 |
TrainingArguments,
|
| 36 |
)
|
| 37 |
+
from huggingface_hub import HfApi
|
| 38 |
+
import trackio
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
|
| 42 |
+
"""
|
| 43 |
+
Validate a Hugging Face token and return the username.
|
| 44 |
+
|
| 45 |
+
Args:
|
| 46 |
+
token (str): The Hugging Face token to validate
|
| 47 |
+
|
| 48 |
+
Returns:
|
| 49 |
+
Tuple[bool, Optional[str], Optional[str]]:
|
| 50 |
+
- success: True if token is valid, False otherwise
|
| 51 |
+
- username: The username associated with the token (if valid)
|
| 52 |
+
- error_message: Error message if validation failed
|
| 53 |
+
"""
|
| 54 |
+
try:
|
| 55 |
+
# Create API client with token directly
|
| 56 |
+
api = HfApi(token=token)
|
| 57 |
+
|
| 58 |
+
# Try to get user info - this will fail if token is invalid
|
| 59 |
+
user_info = api.whoami()
|
| 60 |
+
|
| 61 |
+
# Extract username from user info
|
| 62 |
+
username = user_info.get("name", user_info.get("username"))
|
| 63 |
+
|
| 64 |
+
if not username:
|
| 65 |
+
return False, None, "Could not retrieve username from token"
|
| 66 |
+
|
| 67 |
+
return True, username, None
|
| 68 |
+
|
| 69 |
+
except Exception as e:
|
| 70 |
+
error_msg = str(e)
|
| 71 |
+
if "401" in error_msg or "unauthorized" in error_msg.lower():
|
| 72 |
+
return False, None, "Invalid token - unauthorized access"
|
| 73 |
+
elif "403" in error_msg:
|
| 74 |
+
return False, None, "Token lacks required permissions"
|
| 75 |
+
elif "network" in error_msg.lower() or "connection" in error_msg.lower():
|
| 76 |
+
return False, None, f"Network error: {error_msg}"
|
| 77 |
+
else:
|
| 78 |
+
return False, None, f"Validation error: {error_msg}"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> str:
|
| 82 |
+
"""
|
| 83 |
+
Generate a default space name with username and timestamp.
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
|
| 87 |
+
|
| 88 |
+
Returns:
|
| 89 |
+
str: Default space name in format "username/project-type-timestamp"
|
| 90 |
+
"""
|
| 91 |
+
try:
|
| 92 |
+
# Get token from environment variables only
|
| 93 |
+
import os
|
| 94 |
+
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
| 95 |
+
|
| 96 |
+
if not token:
|
| 97 |
+
print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
|
| 98 |
+
print("Set HF_TOKEN environment variable to enable automatic space creation.")
|
| 99 |
+
print("Example: export HF_TOKEN=your_token_here")
|
| 100 |
+
print("Falling back to local-only mode.")
|
| 101 |
+
return None
|
| 102 |
+
|
| 103 |
+
# Validate token and get username
|
| 104 |
+
success, username, error = validate_hf_token(token)
|
| 105 |
+
if success and username:
|
| 106 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 107 |
+
return f"{username}/{project_type}-{timestamp}"
|
| 108 |
+
else:
|
| 109 |
+
print(f"Warning: Token validation failed: {error}")
|
| 110 |
+
print("Falling back to local-only mode.")
|
| 111 |
+
return None
|
| 112 |
+
|
| 113 |
+
except Exception as e:
|
| 114 |
+
print(f"Warning: Failed to generate default space name: {e}")
|
| 115 |
+
return None
|
| 116 |
|
| 117 |
|
| 118 |
class VoxtralDataCollator:
|
|
|
|
| 263 |
parser.add_argument("--epochs", type=float, default=3)
|
| 264 |
parser.add_argument("--logging-steps", type=int, default=10)
|
| 265 |
parser.add_argument("--save-steps", type=int, default=50)
|
| 266 |
+
parser.add_argument("--trackio-space", type=str, default=None,
|
| 267 |
+
help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
|
| 268 |
+
parser.add_argument("--push-dataset", action="store_true",
|
| 269 |
+
help="Push the training dataset to Hugging Face Hub after training")
|
| 270 |
+
parser.add_argument("--dataset-repo", type=str, default=None,
|
| 271 |
+
help="Dataset repository name for pushing dataset (format: username/dataset-name)")
|
| 272 |
args = parser.parse_args()
|
| 273 |
|
| 274 |
model_checkpoint = args.model_checkpoint
|
|
|
|
| 277 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 278 |
print(f"Using device: {torch_device}")
|
| 279 |
|
| 280 |
+
# Determine trackio space
|
| 281 |
+
trackio_space = args.trackio_space
|
| 282 |
+
if not trackio_space:
|
| 283 |
+
trackio_space = get_default_space_name("voxtral-asr-finetuning")
|
| 284 |
+
|
| 285 |
+
# Initialize trackio for experiment tracking
|
| 286 |
+
if trackio_space:
|
| 287 |
+
print(f"Initializing trackio with space: {trackio_space}")
|
| 288 |
+
trackio.init(
|
| 289 |
+
project="voxtral-finetuning",
|
| 290 |
+
config={
|
| 291 |
+
"model_checkpoint": model_checkpoint,
|
| 292 |
+
"output_dir": output_dir,
|
| 293 |
+
"batch_size": args.batch_size,
|
| 294 |
+
"learning_rate": args.learning_rate,
|
| 295 |
+
"epochs": args.epochs,
|
| 296 |
+
"train_count": args.train_count,
|
| 297 |
+
"eval_count": args.eval_count,
|
| 298 |
+
"dataset_jsonl": args.dataset_jsonl,
|
| 299 |
+
"dataset_name": args.dataset_name,
|
| 300 |
+
"dataset_config": args.dataset_config,
|
| 301 |
+
},
|
| 302 |
+
space_id=trackio_space
|
| 303 |
+
)
|
| 304 |
+
else:
|
| 305 |
+
print("Initializing trackio in local-only mode")
|
| 306 |
+
trackio.init(
|
| 307 |
+
project="voxtral-finetuning",
|
| 308 |
+
config={
|
| 309 |
+
"model_checkpoint": model_checkpoint,
|
| 310 |
+
"output_dir": output_dir,
|
| 311 |
+
"batch_size": args.batch_size,
|
| 312 |
+
"learning_rate": args.learning_rate,
|
| 313 |
+
"epochs": args.epochs,
|
| 314 |
+
"train_count": args.train_count,
|
| 315 |
+
"eval_count": args.eval_count,
|
| 316 |
+
"dataset_jsonl": args.dataset_jsonl,
|
| 317 |
+
"dataset_name": args.dataset_name,
|
| 318 |
+
"dataset_config": args.dataset_config,
|
| 319 |
+
}
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
print("Loading processor and model...")
|
| 323 |
processor = VoxtralProcessor.from_pretrained(model_checkpoint)
|
| 324 |
model = VoxtralForConditionalGeneration.from_pretrained(
|
|
|
|
| 350 |
save_steps=args.save_steps,
|
| 351 |
eval_strategy="steps" if eval_dataset else "no",
|
| 352 |
save_strategy="steps",
|
| 353 |
+
report_to=["trackio"],
|
| 354 |
remove_unused_columns=False,
|
| 355 |
dataloader_num_workers=1,
|
| 356 |
)
|
|
|
|
| 373 |
if eval_dataset:
|
| 374 |
results = trainer.evaluate()
|
| 375 |
print(f"Final evaluation results: {results}")
|
| 376 |
+
# Log final evaluation results
|
| 377 |
+
trackio.log(results)
|
| 378 |
+
|
| 379 |
+
# Push dataset to Hub if requested
|
| 380 |
+
if args.push_dataset and args.dataset_jsonl:
|
| 381 |
+
print("Pushing dataset to Hugging Face Hub...")
|
| 382 |
+
try:
|
| 383 |
+
from pathlib import Path
|
| 384 |
+
import subprocess
|
| 385 |
+
|
| 386 |
+
dataset_repo = args.dataset_repo
|
| 387 |
+
if not dataset_repo:
|
| 388 |
+
# Auto-generate dataset repo name
|
| 389 |
+
if trackio_space:
|
| 390 |
+
username = trackio_space.split('/')[0]
|
| 391 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 392 |
+
dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
|
| 393 |
+
else:
|
| 394 |
+
print("Warning: Cannot auto-generate dataset repo name without HF token")
|
| 395 |
+
dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
|
| 396 |
+
|
| 397 |
+
# Call the push script
|
| 398 |
+
push_cmd = [
|
| 399 |
+
"python", str(Path(__file__).parent / "push_to_huggingface.py"),
|
| 400 |
+
"dataset", args.dataset_jsonl, dataset_repo
|
| 401 |
+
]
|
| 402 |
+
|
| 403 |
+
result = subprocess.run(push_cmd, capture_output=True, text=True)
|
| 404 |
+
if result.returncode == 0:
|
| 405 |
+
print(f"β
Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
|
| 406 |
+
else:
|
| 407 |
+
print(f"β Failed to push dataset: {result.stderr}")
|
| 408 |
+
|
| 409 |
+
except Exception as e:
|
| 410 |
+
print(f"β Error pushing dataset: {e}")
|
| 411 |
+
|
| 412 |
+
# Finish trackio logging
|
| 413 |
+
trackio.finish()
|
| 414 |
|
| 415 |
print("Training completed successfully!")
|
| 416 |
|
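Putting the new flags together, a run that logs to a Trackio Space and pushes the dataset afterwards could be launched as below (space and repo names are illustrative; both are auto-generated from HF_TOKEN when omitted):

```bash
python scripts/train.py \
    --dataset-jsonl datasets/voxtral_user/data.jsonl \
    --trackio-space username/voxtral-asr-finetuning-20250101-120000 \
    --push-dataset \
    --dataset-repo username/voxtral-dataset
```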
scripts/train_lora.py
CHANGED
|
@@ -1,8 +1,32 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
|
| 3 |
import argparse
|
| 4 |
import json
|
| 5 |
from pathlib import Path
|
| 6 |
import torch
|
| 7 |
from datasets import load_dataset, Audio, Dataset
|
| 8 |
from transformers import (
|
|
@@ -12,6 +36,85 @@ from transformers import (
|
|
| 12 |
TrainingArguments,
|
| 13 |
)
|
| 14 |
from peft import LoraConfig, get_peft_model
|
| 15 |
|
| 16 |
|
| 17 |
class VoxtralDataCollator:
|
|
@@ -163,6 +266,12 @@ def main():
|
|
| 163 |
parser.add_argument("--lora-alpha", type=int, default=32)
|
| 164 |
parser.add_argument("--lora-dropout", type=float, default=0.0)
|
| 165 |
parser.add_argument("--freeze-audio-tower", action="store_true", help="Freeze audio encoder parameters")
|
| 166 |
args = parser.parse_args()
|
| 167 |
|
| 168 |
model_checkpoint = args.model_checkpoint
|
|
@@ -171,6 +280,56 @@ def main():
|
|
| 171 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 172 |
print(f"Using device: {torch_device}")
|
| 173 |
|
| 174 |
print("Loading processor and model...")
|
| 175 |
processor = VoxtralProcessor.from_pretrained(model_checkpoint)
|
| 176 |
lora_cfg = LoraConfig(
|
|
@@ -210,12 +369,12 @@ def main():
|
|
| 210 |
learning_rate=args.learning_rate,
|
| 211 |
num_train_epochs=args.epochs,
|
| 212 |
bf16=True,
|
| 213 |
-
logging_steps=args.
|
| 214 |
eval_steps=args.save_steps if eval_dataset else None,
|
| 215 |
save_steps=args.save_steps,
|
| 216 |
eval_strategy="steps" if eval_dataset else "no",
|
| 217 |
save_strategy="steps",
|
| 218 |
-
report_to="
|
| 219 |
remove_unused_columns=False,
|
| 220 |
dataloader_num_workers=1,
|
| 221 |
)
|
|
@@ -238,6 +397,44 @@ def main():
|
|
| 238 |
if eval_dataset:
|
| 239 |
results = trainer.evaluate()
|
| 240 |
print(f"Final evaluation results: {results}")
|
| 241 |
|
| 242 |
print("Training completed successfully!")
|
| 243 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Voxtral ASR LoRA Fine-tuning Script with Trackio Integration
|
| 4 |
+
|
| 5 |
+
This script fine-tunes Voxtral models using LoRA for ASR tasks with automatic experiment tracking
|
| 6 |
+
via Trackio and Hugging Face Spaces.
|
| 7 |
+
|
| 8 |
+
Features:
|
| 9 |
+
- Automatic username detection from HF_TOKEN environment variable
|
| 10 |
+
- Auto-generated space names with timestamps
|
| 11 |
+
- Local-only mode when no HF_TOKEN is set
|
| 12 |
+
- Comprehensive experiment logging
|
| 13 |
+
- LoRA-specific hyperparameters tracking
|
| 14 |
+
- Optional dataset pushing to Hugging Face Hub
|
| 15 |
+
|
| 16 |
+
Authentication:
|
| 17 |
+
Set HF_TOKEN environment variable to enable automatic space creation:
|
| 18 |
+
Linux/Mac: export HF_TOKEN=your_token_here
|
| 19 |
+
Windows: set HF_TOKEN=your_token_here
|
| 20 |
+
Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
|
| 21 |
+
|
| 22 |
+
Get your token from: https://huggingface.co/settings/tokens
|
| 23 |
+
"""
|
| 24 |
|
| 25 |
import argparse
|
| 26 |
import json
|
| 27 |
from pathlib import Path
|
| 28 |
+
from datetime import datetime
|
| 29 |
+
from typing import Tuple, Optional
|
| 30 |
import torch
|
| 31 |
from datasets import load_dataset, Audio, Dataset
|
| 32 |
from transformers import (
|
|
|
|
| 36 |
TrainingArguments,
|
| 37 |
)
|
| 38 |
from peft import LoraConfig, get_peft_model
|
| 39 |
+
from huggingface_hub import HfApi
|
| 40 |
+
import trackio
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
|
| 44 |
+
"""
|
| 45 |
+
Validate a Hugging Face token and return the username.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
token (str): The Hugging Face token to validate
|
| 49 |
+
|
| 50 |
+
Returns:
|
| 51 |
+
Tuple[bool, Optional[str], Optional[str]]:
|
| 52 |
+
- success: True if token is valid, False otherwise
|
| 53 |
+
- username: The username associated with the token (if valid)
|
| 54 |
+
- error_message: Error message if validation failed
|
| 55 |
+
"""
|
| 56 |
+
try:
|
| 57 |
+
# Create API client with token directly
|
| 58 |
+
api = HfApi(token=token)
|
| 59 |
+
|
| 60 |
+
# Try to get user info - this will fail if token is invalid
|
| 61 |
+
user_info = api.whoami()
|
| 62 |
+
|
| 63 |
+
# Extract username from user info
|
| 64 |
+
username = user_info.get("name", user_info.get("username"))
|
| 65 |
+
|
| 66 |
+
if not username:
|
| 67 |
+
return False, None, "Could not retrieve username from token"
|
| 68 |
+
|
| 69 |
+
return True, username, None
|
| 70 |
+
|
| 71 |
+
except Exception as e:
|
| 72 |
+
error_msg = str(e)
|
| 73 |
+
if "401" in error_msg or "unauthorized" in error_msg.lower():
|
| 74 |
+
return False, None, "Invalid token - unauthorized access"
|
| 75 |
+
elif "403" in error_msg:
|
| 76 |
+
return False, None, "Token lacks required permissions"
|
| 77 |
+
elif "network" in error_msg.lower() or "connection" in error_msg.lower():
|
| 78 |
+
return False, None, f"Network error: {error_msg}"
|
| 79 |
+
else:
|
| 80 |
+
return False, None, f"Validation error: {error_msg}"
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def get_default_space_name(project_type: str = "voxtral-lora-finetuning") -> str:
|
| 84 |
+
"""
|
| 85 |
+
Generate a default space name with username and timestamp.
|
| 86 |
+
|
| 87 |
+
Args:
|
| 88 |
+
project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
str: Default space name in format "username/project-type-timestamp"
|
| 92 |
+
"""
|
| 93 |
+
try:
|
| 94 |
+
# Get token from environment variables only
|
| 95 |
+
import os
|
| 96 |
+
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
| 97 |
+
|
| 98 |
+
if not token:
|
| 99 |
+
print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
|
| 100 |
+
print("Set HF_TOKEN environment variable to enable automatic space creation.")
|
| 101 |
+
print("Example: export HF_TOKEN=your_token_here")
|
| 102 |
+
print("Falling back to local-only mode.")
|
| 103 |
+
return None
|
| 104 |
+
|
| 105 |
+
# Validate token and get username
|
| 106 |
+
success, username, error = validate_hf_token(token)
|
| 107 |
+
if success and username:
|
| 108 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
| 109 |
+
return f"{username}/{project_type}-{timestamp}"
|
| 110 |
+
else:
|
| 111 |
+
print(f"Warning: Token validation failed: {error}")
|
| 112 |
+
print("Falling back to local-only mode.")
|
| 113 |
+
return None
|
| 114 |
+
|
| 115 |
+
except Exception as e:
|
| 116 |
+
print(f"Warning: Failed to generate default space name: {e}")
|
| 117 |
+
return None
|
| 118 |
|
| 119 |
|
| 120 |
class VoxtralDataCollator:
|
|
|
|
| 266 |
parser.add_argument("--lora-alpha", type=int, default=32)
|
| 267 |
parser.add_argument("--lora-dropout", type=float, default=0.0)
|
| 268 |
parser.add_argument("--freeze-audio-tower", action="store_true", help="Freeze audio encoder parameters")
|
| 269 |
+
parser.add_argument("--trackio-space", type=str, default=None,
|
| 270 |
+
help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
|
| 271 |
+
parser.add_argument("--push-dataset", action="store_true",
|
| 272 |
+
help="Push the training dataset to Hugging Face Hub after training")
|
| 273 |
+
parser.add_argument("--dataset-repo", type=str, default=None,
|
| 274 |
+
help="Dataset repository name for pushing dataset (format: username/dataset-name)")
|
| 275 |
args = parser.parse_args()
|
| 276 |
|
| 277 |
model_checkpoint = args.model_checkpoint
|
|
|
|
| 280 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 281 |
print(f"Using device: {torch_device}")
|
| 282 |
|
| 283 |
+
# Determine trackio space
|
| 284 |
+
trackio_space = args.trackio_space
|
| 285 |
+
if not trackio_space:
|
| 286 |
+
trackio_space = get_default_space_name("voxtral-lora-finetuning")
|
| 287 |
+
|
| 288 |
+
# Initialize trackio for experiment tracking
|
| 289 |
+
if trackio_space:
|
| 290 |
+
print(f"Initializing trackio with space: {trackio_space}")
|
| 291 |
+
trackio.init(
|
| 292 |
+
project="voxtral-lora-finetuning",
|
| 293 |
+
config={
|
| 294 |
+
"model_checkpoint": model_checkpoint,
|
| 295 |
+
"output_dir": output_dir,
|
| 296 |
+
"batch_size": args.batch_size,
|
| 297 |
+
"learning_rate": args.learning_rate,
|
| 298 |
+
"epochs": args.epochs,
|
| 299 |
+
"train_count": args.train_count,
|
| 300 |
+
"eval_count": args.eval_count,
|
| 301 |
+
"dataset_jsonl": args.dataset_jsonl,
|
| 302 |
+
"dataset_name": args.dataset_name,
|
| 303 |
+
"dataset_config": args.dataset_config,
|
| 304 |
+
"lora_r": args.lora_r,
|
| 305 |
+
"lora_alpha": args.lora_alpha,
|
| 306 |
+
"lora_dropout": args.lora_dropout,
|
| 307 |
+
"freeze_audio_tower": args.freeze_audio_tower,
|
| 308 |
+
},
|
| 309 |
+
space_id=trackio_space
|
| 310 |
+
)
|
| 311 |
+
else:
|
| 312 |
+
print("Initializing trackio in local-only mode")
|
| 313 |
+
trackio.init(
|
| 314 |
+
project="voxtral-lora-finetuning",
|
| 315 |
+
config={
|
| 316 |
+
"model_checkpoint": model_checkpoint,
|
| 317 |
+
"output_dir": output_dir,
|
| 318 |
+
"batch_size": args.batch_size,
|
| 319 |
+
"learning_rate": args.learning_rate,
|
| 320 |
+
"epochs": args.epochs,
|
| 321 |
+
"train_count": args.train_count,
|
| 322 |
+
"eval_count": args.eval_count,
|
| 323 |
+
"dataset_jsonl": args.dataset_jsonl,
|
| 324 |
+
"dataset_name": args.dataset_name,
|
| 325 |
+
"dataset_config": args.dataset_config,
|
| 326 |
+
"lora_r": args.lora_r,
|
| 327 |
+
"lora_alpha": args.lora_alpha,
|
| 328 |
+
"lora_dropout": args.lora_dropout,
|
| 329 |
+
"freeze_audio_tower": args.freeze_audio_tower,
|
| 330 |
+
}
|
| 331 |
+
)
|
| 332 |
+
|
 print("Loading processor and model...")
 processor = VoxtralProcessor.from_pretrained(model_checkpoint)
 lora_cfg = LoraConfig(
@@ ... @@
     learning_rate=args.learning_rate,
     num_train_epochs=args.epochs,
     bf16=True,
+    logging_steps=args.logging_steps,
     eval_steps=args.save_steps if eval_dataset else None,
     save_steps=args.save_steps,
     eval_strategy="steps" if eval_dataset else "no",
     save_strategy="steps",
+    report_to=["trackio"],
     remove_unused_columns=False,
     dataloader_num_workers=1,
 )
@@ ... @@
 if eval_dataset:
     results = trainer.evaluate()
     print(f"Final evaluation results: {results}")
+    # Log final evaluation results
+    trackio.log(results)
+
+# Push dataset to Hub if requested
+if args.push_dataset and args.dataset_jsonl:
+    print("Pushing dataset to Hugging Face Hub...")
+    try:
+        from pathlib import Path
+        import subprocess
+
+        dataset_repo = args.dataset_repo
+        if not dataset_repo:
+            # Auto-generate dataset repo name
+            if trackio_space:
+                username = trackio_space.split('/')[0]
+                timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+                dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
+            else:
+                print("Warning: Cannot auto-generate dataset repo name without HF token")
+                dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+        # Call the push script
+        push_cmd = [
+            "python", str(Path(__file__).parent / "push_to_huggingface.py"),
+            "dataset", args.dataset_jsonl, dataset_repo
+        ]
+
+        result = subprocess.run(push_cmd, capture_output=True, text=True)
+        if result.returncode == 0:
+            print(f"✅ Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
+        else:
+            print(f"❌ Failed to push dataset: {result.stderr}")
+
+    except Exception as e:
+        print(f"❌ Error pushing dataset: {e}")
+
+# Finish trackio logging
+trackio.finish()

 print("Training completed successfully!")
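The same push can be reproduced by hand after training, using the script the trainer shells out to (a sketch; the JSONL path and repo name are examples):

```python
# Sketch: manually push a dataset JSONL via scripts/push_to_huggingface.py.
import subprocess
from pathlib import Path

push_cmd = [
    "python", str(Path("scripts") / "push_to_huggingface.py"),
    "dataset", "datasets/voxtral_user/data.jsonl",
    "your-username/voxtral-dataset-demo",
]
result = subprocess.run(push_cmd, capture_output=True, text=True)
print(result.stdout if result.returncode == 0 else result.stderr)
```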
templates/datasets/readme.md
DELETED
@@ -1,171 +0,0 @@
----
-dataset_info:
-  features:
-  - name: experiment_id
-    dtype: string
-  - name: name
-    dtype: string
-  - name: description
-    dtype: string
-  - name: created_at
-    dtype: string
-  - name: status
-    dtype: string
-  - name: metrics
-    dtype: string
-  - name: parameters
-    dtype: string
-  - name: artifacts
-    dtype: string
-  - name: logs
-    dtype: string
-  - name: last_updated
-    dtype: string
-  splits:
-  - name: train
-    num_bytes: 4945
-    num_examples: 2
-  download_size: 15529
-  dataset_size: 4945
-configs:
-- config_name: default
-  data_files:
-  - split: train
-    path: data/train-*
-tags:
-- track tonic
-- tonic
-- experiment tracking
-- smollm3
-- fine-tuning
-- legml
-- hermes
----
-
-# Trackio Experiments Dataset
-
-This dataset stores experiment tracking data for ML training runs, particularly focused on SmolLM3 fine-tuning experiments with comprehensive metrics tracking.
-
-## Dataset Structure
-
-The dataset contains the following columns:
-
-- **experiment_id**: Unique identifier for each experiment
-- **name**: Human-readable name for the experiment
-- **description**: Detailed description of the experiment
-- **created_at**: Timestamp when the experiment was created
-- **status**: Current status (running, completed, failed, paused)
-- **metrics**: JSON string containing training metrics over time
-- **parameters**: JSON string containing experiment configuration
-- **artifacts**: JSON string containing experiment artifacts
-- **logs**: JSON string containing experiment logs
-- **last_updated**: Timestamp of last update
-
-## Metrics Structure
-
-The metrics field contains JSON arrays with the following structure:
-
-```json
-[
-  {
-    "timestamp": "2025-07-20T11:20:01.780908",
-    "step": 25,
-    "metrics": {
-      "loss": 1.1659,
-      "accuracy": 0.759,
-      "learning_rate": 7e-08,
-      "grad_norm": 10.3125,
-      "epoch": 0.004851130919895701,
-
-      // Advanced Training Metrics
-      "total_tokens": 1642080.0,
-      "truncated_tokens": 128,
-      "padding_tokens": 256,
-      "throughput": 3284160.0,
-      "step_time": 0.5,
-      "batch_size": 8,
-      "seq_len": 2048,
-      "token_acc": 0.759,
-
-      // Custom Losses
-      "train/gate_ortho": 0.0234,
-      "train/center": 0.0156,
-
-      // System Metrics
-      "gpu_memory_allocated": 17.202261447906494,
-      "gpu_memory_reserved": 75.474609375,
-      "gpu_utilization": 85.2,
-      "cpu_percent": 2.7,
-      "memory_percent": 10.1
-    }
-  }
-]
-```
-
-## Supported Metrics
-
-### Core Training Metrics
-- **loss**: Training loss value
-- **accuracy**: Model accuracy
-- **learning_rate**: Current learning rate
-- **grad_norm**: Gradient norm
-- **epoch**: Current epoch progress
-
-### Advanced Token Metrics
-- **total_tokens**: Total tokens processed in the batch
-- **truncated_tokens**: Number of tokens truncated during processing
-- **padding_tokens**: Number of padding tokens added
-- **throughput**: Tokens processed per second
-- **step_time**: Time taken for the current training step
-- **batch_size**: Current batch size
-- **seq_len**: Sequence length
-- **token_acc**: Token-level accuracy
-
-### Custom Losses (SmolLM3-specific)
-- **train/gate_ortho**: Gate orthogonality loss
-- **train/center**: Center loss component
-
-### System Performance Metrics
-- **gpu_memory_allocated**: GPU memory currently allocated (GB)
-- **gpu_memory_reserved**: GPU memory reserved (GB)
-- **gpu_utilization**: GPU utilization percentage
-- **cpu_percent**: CPU usage percentage
-- **memory_percent**: System memory usage percentage
-
-## Usage
-
-This dataset is automatically used by the Trackio monitoring system to store and retrieve experiment data. It provides persistent storage for experiment tracking across different training runs.
-
-## Integration
-
-The dataset is used by:
-- Trackio Spaces for experiment visualization
-- Training scripts for logging metrics and parameters
-- Monitoring systems for experiment tracking
-- SmolLM3 fine-tuning pipeline for comprehensive metrics capture
-
-## Privacy
-
-This dataset is private by default to ensure experiment data security. Only users with appropriate permissions can access the data.
-
-## Examples
-
-### Sample Experiment Entry
-```json
-{
-  "experiment_id": "exp_20250720_130853",
-  "name": "smollm3_finetune",
-  "description": "SmolLM3 fine-tuning experiment with comprehensive metrics",
-  "created_at": "2025-07-20T11:20:01.780908",
-  "status": "running",
-  "metrics": "[{\"timestamp\": \"2025-07-20T11:20:01.780908\", \"step\": 25, \"metrics\": {\"loss\": 1.1659, \"accuracy\": 0.759, \"total_tokens\": 1642080.0, \"throughput\": 3284160.0, \"train/gate_ortho\": 0.0234, \"train/center\": 0.0156}}]",
-  "parameters": "{\"model_name\": \"HuggingFaceTB/SmolLM3-3B\", \"batch_size\": 8, \"learning_rate\": 3.5e-06, \"max_seq_length\": 12288}",
-  "artifacts": "[]",
-  "logs": "[]",
-  "last_updated": "2025-07-20T11:20:01.780908"
-}
-```
-
-## License
-
-This dataset is part of the Trackio experiment tracking system and follows the same license as the main project.
tests/test_hf_setup.py
ADDED
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Test Hugging Face Setup for Trackio Integration
+
+This script helps verify your Hugging Face token setup and test space name generation.
+Run this before using the training scripts to ensure everything is configured correctly.
+
+Authentication:
+    This script only checks for HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variables.
+    It does NOT use huggingface-cli login state.
+
+Setup:
+    Linux/Mac: export HF_TOKEN=your_token_here
+    Windows:   set HF_TOKEN=your_token_here
+    Or:        export HUGGINGFACE_HUB_TOKEN=your_token_here
+
+Get your token from: https://huggingface.co/settings/tokens
+"""
+
+import os
+from datetime import datetime
+from typing import Tuple, Optional
+from huggingface_hub import HfApi
+
+
+def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    Validate a Hugging Face token and return the username.
+
+    Args:
+        token (str): The Hugging Face token to validate
+
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+            - success: True if token is valid, False otherwise
+            - username: The username associated with the token (if valid)
+            - error_message: Error message if validation failed
+    """
+    try:
+        # Create API client with token directly
+        api = HfApi(token=token)
+
+        # Try to get user info - this will fail if token is invalid
+        user_info = api.whoami()
+
+        # Extract username from user info
+        username = user_info.get("name", user_info.get("username"))
+
+        if not username:
+            return False, None, "Could not retrieve username from token"
+
+        return True, username, None
+
+    except Exception as e:
+        error_msg = str(e)
+        if "401" in error_msg or "unauthorized" in error_msg.lower():
+            return False, None, "Invalid token - unauthorized access"
+        elif "403" in error_msg:
+            return False, None, "Token lacks required permissions"
+        elif "network" in error_msg.lower() or "connection" in error_msg.lower():
+            return False, None, f"Network error: {error_msg}"
+        else:
+            return False, None, f"Validation error: {error_msg}"
+
+
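For a quick standalone check of `validate_hf_token` (a sketch; assumes `HF_TOKEN` is exported):

```python
# Sketch: validate whatever token is currently in the environment.
import os
ok, user, err = validate_hf_token(os.environ.get("HF_TOKEN", ""))
print(f"valid={ok}, username={user}, error={err}")
```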
+def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> Optional[str]:
+    """
+    Generate a default space name with username and timestamp.
+
+    Args:
+        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
+
+    Returns:
+        Optional[str]: Default space name in format "username/project-type-timestamp",
+        or None if no valid token is available
+    """
+    try:
+        # Get token from environment variables only
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            return None
+
+        # Validate token and get username
+        success, username, error = validate_hf_token(token)
+        if success and username:
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            return f"{username}/{project_type}-{timestamp}"
+        else:
+            return None
+
+    except Exception as e:
+        print(f"Failed to generate default space name: {e}")
+        return None
+
+
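The generated name follows a fixed shape (a sketch; the username and timestamp are illustrative):

```python
# Sketch: expected output of get_default_space_name.
space = get_default_space_name("voxtral-asr-finetuning")
# e.g. "your-username/voxtral-asr-finetuning-20250720-112001", or None when no
# HF_TOKEN/HUGGINGFACE_HUB_TOKEN is set or the token fails validation.
print(space)
```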
+def main():
+    print("🔍 Testing Hugging Face Setup for Trackio Integration")
+    print("=" * 60)
+
+    # Check for tokens
+    print("\n1. Checking for Hugging Face tokens...")
+
+    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+    if token:
+        print(f"✅ Found token in environment: {token[:10]}...")
+    else:
+        print("❌ No token found in environment variables")
+        print("\n❌ No Hugging Face token found!")
+        print("Please set the HF_TOKEN environment variable:")
+        print("  Linux/Mac: export HF_TOKEN=your_token_here")
+        print("  Windows:   set HF_TOKEN=your_token_here")
+        print("  Or:        set HUGGINGFACE_HUB_TOKEN=your_token_here")
+        print("\nGet your token from: https://huggingface.co/settings/tokens")
+        return
+
+    # Validate token
+    print("\n2. Validating token...")
+    success, username, error = validate_hf_token(token)
+
+    if success:
+        print(f"✅ Token is valid! Username: {username}")
+    else:
+        print(f"❌ Token validation failed: {error}")
+        return
+
+    # Generate space names
+    print("\n3. Generating default space names...")
+
+    full_finetune_space = get_default_space_name("voxtral-asr-finetuning")
+    lora_finetune_space = get_default_space_name("voxtral-lora-finetuning")
+
+    print(f"📊 Full fine-tuning space: {full_finetune_space}")
+    print(f"📊 LoRA fine-tuning space: {lora_finetune_space}")
+
+    print("\n✅ Setup complete! You can now run training scripts.")
+    print("   They will automatically use the generated space names.")
+    print("\n💡 To override the auto-generated names, use --trackio-space yourname/custom-space")
+
+
+if __name__ == "__main__":
+    main()