Spaces:
Running
Running
Joseph Pollack
commited on
adds automatic authentication , dataset readme , push to hub automation , demo , readme , and interface improvements
Browse files- interface.py +120 -3
- requirements.txt +3 -1
- scripts/push_to_huggingface.py +223 -175
- scripts/train.py +189 -1
- scripts/train_lora.py +199 -2
- templates/datasets/readme.md +0 -171
- tests/test_hf_setup.py +141 -0
interface.py
CHANGED
@@ -155,6 +155,104 @@ def _save_uploaded_dataset(files: list, transcripts: list[str]) -> str:
|
|
155 |
return str(jsonl_path)
|
156 |
|
157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
def _save_recordings(recordings: list[tuple[int, list]], transcripts: list[str]) -> str:
|
159 |
import soundfile as sf
|
160 |
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
@@ -231,6 +329,7 @@ def start_voxtral_training(
|
|
231 |
repo_name = f"{username}/{repo_short}" if username else repo_short
|
232 |
push_args = [
|
233 |
str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
|
|
|
234 |
str(output_dir),
|
235 |
repo_name,
|
236 |
]
|
@@ -519,6 +618,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
519 |
gr.update(visible=True), # dataset_status
|
520 |
gr.update(visible=True), # advanced_accordion
|
521 |
gr.update(visible=True), # save_rec_btn
|
|
|
522 |
gr.update(visible=True), # start_btn
|
523 |
gr.update(visible=True), # logs_box
|
524 |
]
|
@@ -607,17 +707,27 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
607 |
gr.Markdown("### Upload audio + transcripts (optional)")
|
608 |
upload_audio = gr.File(file_count="multiple", type="filepath", label="Upload WAV/FLAC files (optional)")
|
609 |
transcripts_box = gr.Textbox(lines=6, label="Transcripts (one per line, aligned with files)")
|
|
|
|
|
610 |
save_upload_btn = gr.Button("Save uploaded dataset")
|
|
|
611 |
|
612 |
def _collect_upload(files, txt):
|
613 |
lines = [s.strip() for s in (txt or "").splitlines() if s.strip()]
|
614 |
-
|
|
|
615 |
|
616 |
-
|
617 |
-
|
|
|
|
|
|
|
|
|
|
|
618 |
|
619 |
# Save recordings button
|
620 |
save_rec_btn = gr.Button("Save recordings as dataset", visible=False)
|
|
|
621 |
|
622 |
def _collect_preloaded_recs(*recs_and_texts):
|
623 |
import soundfile as sf
|
@@ -646,6 +756,13 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
|
|
646 |
|
647 |
save_rec_btn.click(_collect_preloaded_recs, rec_components + [phrase_texts_state], [jsonl_path_state])
|
648 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
649 |
# Removed multilingual dataset sample section - phrases are now loaded automatically when language is selected
|
650 |
|
651 |
start_btn = gr.Button("Start Fine-tuning", visible=False)
|
|
|
155 |
return str(jsonl_path)
|
156 |
|
157 |
|
158 |
+
def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") -> str:
|
159 |
+
"""Push dataset to Hugging Face Hub"""
|
160 |
+
try:
|
161 |
+
from huggingface_hub import HfApi, create_repo
|
162 |
+
import json
|
163 |
+
from pathlib import Path
|
164 |
+
|
165 |
+
token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
166 |
+
|
167 |
+
if not token:
|
168 |
+
return "β No HF_TOKEN found. Set HF_TOKEN environment variable to push datasets."
|
169 |
+
|
170 |
+
api = HfApi(token=token)
|
171 |
+
|
172 |
+
# Determine full repo name
|
173 |
+
if "/" not in repo_name:
|
174 |
+
if not username:
|
175 |
+
user_info = api.whoami()
|
176 |
+
username = user_info.get("name") or user_info.get("username") or ""
|
177 |
+
if username:
|
178 |
+
repo_name = f"{username}/{repo_name}"
|
179 |
+
|
180 |
+
# Create dataset repository
|
181 |
+
try:
|
182 |
+
create_repo(repo_name, repo_type="dataset", token=token, exist_ok=True)
|
183 |
+
except Exception as e:
|
184 |
+
if "already exists" not in str(e).lower():
|
185 |
+
return f"β Failed to create dataset repo: {e}"
|
186 |
+
|
187 |
+
# Read the JSONL file
|
188 |
+
jsonl_file = Path(jsonl_path)
|
189 |
+
if not jsonl_file.exists():
|
190 |
+
return f"β Dataset file not found: {jsonl_path}"
|
191 |
+
|
192 |
+
# Upload the JSONL file
|
193 |
+
api.upload_file(
|
194 |
+
path_or_fileobj=str(jsonl_file),
|
195 |
+
path_in_repo="data.jsonl",
|
196 |
+
repo_id=repo_name,
|
197 |
+
repo_type="dataset",
|
198 |
+
token=token
|
199 |
+
)
|
200 |
+
|
201 |
+
# Create a simple README for the dataset
|
202 |
+
readme_content = f"""---
|
203 |
+
dataset_info:
|
204 |
+
features:
|
205 |
+
- name: audio_path
|
206 |
+
dtype: string
|
207 |
+
- name: text
|
208 |
+
dtype: string
|
209 |
+
splits:
|
210 |
+
- name: train
|
211 |
+
num_bytes: {jsonl_file.stat().st_size}
|
212 |
+
num_examples: {sum(1 for _ in open(jsonl_file))}
|
213 |
+
download_size: {jsonl_file.stat().st_size}
|
214 |
+
dataset_size: {jsonl_file.stat().st_size}
|
215 |
+
---
|
216 |
+
|
217 |
+
# Voxtral ASR Dataset
|
218 |
+
|
219 |
+
This dataset was created using the Voxtral ASR Fine-tuning Interface.
|
220 |
+
|
221 |
+
## Dataset Structure
|
222 |
+
|
223 |
+
- **audio_path**: Path to the audio file
|
224 |
+
- **text**: Transcription of the audio
|
225 |
+
|
226 |
+
## Usage
|
227 |
+
|
228 |
+
```python
|
229 |
+
from datasets import load_dataset
|
230 |
+
|
231 |
+
dataset = load_dataset("{repo_name}")
|
232 |
+
```
|
233 |
+
"""
|
234 |
+
|
235 |
+
# Upload README
|
236 |
+
readme_path = jsonl_file.parent / "README.md"
|
237 |
+
with open(readme_path, "w") as f:
|
238 |
+
f.write(readme_content)
|
239 |
+
|
240 |
+
api.upload_file(
|
241 |
+
path_or_fileobj=str(readme_path),
|
242 |
+
path_in_repo="README.md",
|
243 |
+
repo_id=repo_name,
|
244 |
+
repo_type="dataset",
|
245 |
+
token=token
|
246 |
+
)
|
247 |
+
|
248 |
+
readme_path.unlink() # Clean up temp file
|
249 |
+
|
250 |
+
return f"β
Dataset pushed to: https://huggingface.co/datasets/{repo_name}"
|
251 |
+
|
252 |
+
except Exception as e:
|
253 |
+
return f"β Failed to push dataset: {e}"
|
254 |
+
|
255 |
+
|
256 |
def _save_recordings(recordings: list[tuple[int, list]], transcripts: list[str]) -> str:
|
257 |
import soundfile as sf
|
258 |
dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
|
|
|
329 |
repo_name = f"{username}/{repo_short}" if username else repo_short
|
330 |
push_args = [
|
331 |
str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
|
332 |
+
"model",
|
333 |
str(output_dir),
|
334 |
repo_name,
|
335 |
]
|
|
|
618 |
gr.update(visible=True), # dataset_status
|
619 |
gr.update(visible=True), # advanced_accordion
|
620 |
gr.update(visible=True), # save_rec_btn
|
621 |
+
gr.update(visible=True), # push_recordings_btn
|
622 |
gr.update(visible=True), # start_btn
|
623 |
gr.update(visible=True), # logs_box
|
624 |
]
|
|
|
707 |
gr.Markdown("### Upload audio + transcripts (optional)")
|
708 |
upload_audio = gr.File(file_count="multiple", type="filepath", label="Upload WAV/FLAC files (optional)")
|
709 |
transcripts_box = gr.Textbox(lines=6, label="Transcripts (one per line, aligned with files)")
|
710 |
+
dataset_repo_name = gr.Textbox(value=f"voxtral-dataset-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
711 |
+
label="Dataset repo name (will be pushed to HF Hub)")
|
712 |
save_upload_btn = gr.Button("Save uploaded dataset")
|
713 |
+
push_dataset_btn = gr.Button("Push dataset to HF Hub")
|
714 |
|
715 |
def _collect_upload(files, txt):
|
716 |
lines = [s.strip() for s in (txt or "").splitlines() if s.strip()]
|
717 |
+
jsonl_path = _save_uploaded_dataset(files or [], lines)
|
718 |
+
return f"β
Dataset saved locally: {jsonl_path}"
|
719 |
|
720 |
+
def _push_dataset_handler(repo_name):
|
721 |
+
if not jsonl_path_state.value:
|
722 |
+
return "β No dataset saved yet. Please save dataset first."
|
723 |
+
return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
|
724 |
+
|
725 |
+
save_upload_btn.click(_collect_upload, [upload_audio, transcripts_box], [jsonl_path_state])
|
726 |
+
push_dataset_btn.click(_push_dataset_handler, [dataset_repo_name], [jsonl_path_state])
|
727 |
|
728 |
# Save recordings button
|
729 |
save_rec_btn = gr.Button("Save recordings as dataset", visible=False)
|
730 |
+
push_recordings_btn = gr.Button("Push recordings dataset to HF Hub", visible=False)
|
731 |
|
732 |
def _collect_preloaded_recs(*recs_and_texts):
|
733 |
import soundfile as sf
|
|
|
756 |
|
757 |
save_rec_btn.click(_collect_preloaded_recs, rec_components + [phrase_texts_state], [jsonl_path_state])
|
758 |
|
759 |
+
def _push_recordings_handler(repo_name):
|
760 |
+
if not jsonl_path_state.value:
|
761 |
+
return "β No recordings dataset saved yet. Please save recordings first."
|
762 |
+
return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
|
763 |
+
|
764 |
+
push_recordings_btn.click(_push_recordings_handler, [dataset_repo_name], [jsonl_path_state])
|
765 |
+
|
766 |
# Removed multilingual dataset sample section - phrases are now loaded automatically when language is selected
|
767 |
|
768 |
start_btn = gr.Button("Start Fine-tuning", visible=False)
|
requirements.txt
CHANGED
@@ -2,4 +2,6 @@ torch
|
|
2 |
datasets
|
3 |
peft
|
4 |
transformers
|
5 |
-
gradio
|
|
|
|
|
|
2 |
datasets
|
3 |
peft
|
4 |
transformers
|
5 |
+
gradio
|
6 |
+
trackio
|
7 |
+
huggingface_hub
|
scripts/push_to_huggingface.py
CHANGED
@@ -1,20 +1,26 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
-
Push Trained
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
"""
|
6 |
|
7 |
import os
|
8 |
import json
|
9 |
import argparse
|
10 |
import logging
|
11 |
-
import time
|
12 |
from pathlib import Path
|
13 |
-
from typing import Dict, Any, Optional
|
14 |
from datetime import datetime
|
15 |
-
import subprocess
|
16 |
-
import shutil
|
17 |
-
import platform
|
18 |
|
19 |
# Set timeout for HF operations to prevent hanging
|
20 |
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
|
@@ -22,34 +28,15 @@ os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600'
|
|
22 |
|
23 |
try:
|
24 |
from huggingface_hub import HfApi, create_repo, upload_file
|
25 |
-
from huggingface_hub import snapshot_download, hf_hub_download
|
26 |
HF_AVAILABLE = True
|
27 |
except ImportError:
|
28 |
HF_AVAILABLE = False
|
29 |
print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")
|
30 |
|
31 |
-
try:
|
32 |
-
import sys
|
33 |
-
import os
|
34 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
35 |
-
from monitoring import SmolLM3Monitor
|
36 |
-
MONITORING_AVAILABLE = True
|
37 |
-
except ImportError:
|
38 |
-
MONITORING_AVAILABLE = False
|
39 |
-
print("Warning: monitoring module not available")
|
40 |
-
|
41 |
logger = logging.getLogger(__name__)
|
42 |
|
43 |
-
class TimeoutError(Exception):
|
44 |
-
"""Custom timeout exception"""
|
45 |
-
pass
|
46 |
-
|
47 |
-
def timeout_handler(signum, frame):
|
48 |
-
"""Signal handler for timeout"""
|
49 |
-
raise TimeoutError("Operation timed out")
|
50 |
-
|
51 |
class HuggingFacePusher:
|
52 |
-
"""Push trained models
|
53 |
|
54 |
def __init__(
|
55 |
self,
|
@@ -57,44 +44,22 @@ class HuggingFacePusher:
|
|
57 |
repo_name: str,
|
58 |
token: Optional[str] = None,
|
59 |
private: bool = False,
|
60 |
-
trackio_url: Optional[str] = None,
|
61 |
-
experiment_name: Optional[str] = None,
|
62 |
-
dataset_repo: Optional[str] = None,
|
63 |
-
hf_token: Optional[str] = None,
|
64 |
author_name: Optional[str] = None,
|
65 |
model_description: Optional[str] = None,
|
66 |
-
training_config_type: Optional[str] = None,
|
67 |
model_name: Optional[str] = None,
|
68 |
-
dataset_name: Optional[str] = None
|
69 |
-
batch_size: Optional[str] = None,
|
70 |
-
learning_rate: Optional[str] = None,
|
71 |
-
max_epochs: Optional[str] = None,
|
72 |
-
max_seq_length: Optional[str] = None,
|
73 |
-
trainer_type: Optional[str] = None
|
74 |
):
|
75 |
self.model_path = Path(model_path)
|
76 |
# Original user input (may be just the repo name without username)
|
77 |
self.repo_name = repo_name
|
78 |
-
self.token = token or
|
79 |
self.private = private
|
80 |
-
self.trackio_url = trackio_url
|
81 |
-
self.experiment_name = experiment_name
|
82 |
self.author_name = author_name
|
83 |
self.model_description = model_description
|
84 |
-
|
85 |
-
#
|
86 |
-
self.
|
87 |
-
self.model_name = model_name
|
88 |
self.dataset_name = dataset_name
|
89 |
-
self.batch_size = batch_size
|
90 |
-
self.learning_rate = learning_rate
|
91 |
-
self.max_epochs = max_epochs
|
92 |
-
self.max_seq_length = max_seq_length
|
93 |
-
self.trainer_type = trainer_type
|
94 |
-
|
95 |
-
# HF Datasets configuration
|
96 |
-
self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
|
97 |
-
self.hf_token = hf_token or os.getenv('HF_TOKEN')
|
98 |
|
99 |
# Initialize HF API
|
100 |
if HF_AVAILABLE:
|
@@ -105,19 +70,7 @@ class HuggingFacePusher:
|
|
105 |
# Resolve the full repo id (username/repo) if user only provided repo name
|
106 |
self.repo_id = self._resolve_repo_id(self.repo_name)
|
107 |
|
108 |
-
# Initialize monitoring if available
|
109 |
-
self.monitor = None
|
110 |
-
if MONITORING_AVAILABLE:
|
111 |
-
self.monitor = SmolLM3Monitor(
|
112 |
-
experiment_name=experiment_name or "model_push",
|
113 |
-
trackio_url=trackio_url,
|
114 |
-
enable_tracking=bool(trackio_url),
|
115 |
-
hf_token=self.hf_token,
|
116 |
-
dataset_repo=self.dataset_repo
|
117 |
-
)
|
118 |
-
|
119 |
logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
|
120 |
-
logger.info(f"Dataset repository: {self.dataset_repo}")
|
121 |
|
122 |
def _resolve_repo_id(self, repo_name: str) -> str:
|
123 |
"""Return a fully-qualified repo id in the form username/repo.
|
@@ -515,59 +468,33 @@ MIT License
|
|
515 |
logger.error(f"β Failed to create README: {e}")
|
516 |
return False
|
517 |
|
518 |
-
|
519 |
-
|
520 |
-
if self.monitor:
|
521 |
-
try:
|
522 |
-
# Log to Trackio
|
523 |
-
self.monitor.log_metrics({
|
524 |
-
"push_action": action,
|
525 |
-
"repo_name": self.repo_id,
|
526 |
-
"model_size_gb": self._get_model_size(),
|
527 |
-
"dataset_repo": self.dataset_repo,
|
528 |
-
**details
|
529 |
-
})
|
530 |
-
|
531 |
-
# Log training summary
|
532 |
-
self.monitor.log_training_summary({
|
533 |
-
"model_push": True,
|
534 |
-
"model_repo": self.repo_id,
|
535 |
-
"dataset_repo": self.dataset_repo,
|
536 |
-
"push_date": datetime.now().isoformat(),
|
537 |
-
**details
|
538 |
-
})
|
539 |
-
|
540 |
-
logger.info(f"β
Logged {action} to Trackio and HF Datasets")
|
541 |
-
except Exception as e:
|
542 |
-
logger.error(f"β Failed to log to Trackio: {e}")
|
543 |
-
|
544 |
-
def push_model(self, training_config: Optional[Dict[str, Any]] = None,
|
545 |
results: Optional[Dict[str, Any]] = None) -> bool:
|
546 |
-
"""Complete model push process
|
547 |
logger.info(f"π Starting model push to {self.repo_id}")
|
548 |
-
|
549 |
-
|
550 |
# Validate model path
|
551 |
if not self.validate_model_path():
|
552 |
return False
|
553 |
-
|
554 |
# Create repository
|
555 |
if not self.create_repository():
|
556 |
return False
|
557 |
-
|
558 |
# Load training config and results if not provided
|
559 |
if training_config is None:
|
560 |
training_config = self._load_training_config()
|
561 |
-
|
562 |
if results is None:
|
563 |
results = self._load_training_results()
|
564 |
-
|
565 |
# Create and upload model card
|
566 |
model_card = self.create_model_card(training_config, results)
|
567 |
model_card_path = Path("temp_model_card.md")
|
568 |
with open(model_card_path, "w") as f:
|
569 |
f.write(model_card)
|
570 |
-
|
571 |
try:
|
572 |
upload_file(
|
573 |
path_or_fileobj=str(model_card_path),
|
@@ -577,27 +504,135 @@ MIT License
|
|
577 |
)
|
578 |
finally:
|
579 |
model_card_path.unlink()
|
580 |
-
|
581 |
# Upload model files
|
582 |
if not self.upload_model_files():
|
583 |
return False
|
584 |
-
|
585 |
# Upload training results
|
586 |
if results:
|
587 |
self.upload_training_results(str(self.model_path))
|
588 |
-
|
589 |
-
# Log
|
590 |
-
|
591 |
-
"model_path": str(self.model_path),
|
592 |
-
"repo_name": self.repo_name,
|
593 |
-
"private": self.private,
|
594 |
-
"training_config": training_config,
|
595 |
-
"results": results
|
596 |
-
})
|
597 |
-
|
598 |
logger.info(f"π Model successfully pushed to: https://huggingface.co/{self.repo_id}")
|
599 |
-
|
600 |
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
601 |
|
602 |
def _load_training_config(self) -> Dict[str, Any]:
|
603 |
"""Load training configuration"""
|
@@ -619,81 +654,94 @@ def parse_args():
|
|
619 |
"""Parse command line arguments"""
|
620 |
parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')
|
621 |
|
622 |
-
#
|
623 |
-
parser.
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
|
630 |
-
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
|
642 |
-
parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
|
643 |
|
644 |
return parser.parse_args()
|
645 |
|
646 |
def main():
|
647 |
"""Main function"""
|
648 |
args = parse_args()
|
649 |
-
|
650 |
# Setup logging
|
651 |
logging.basicConfig(
|
652 |
level=logging.INFO,
|
653 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
654 |
)
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
|
|
|
659 |
try:
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
-
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
|
677 |
-
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
|
685 |
-
|
686 |
-
logger.info(
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
693 |
except Exception as e:
|
694 |
-
logger.error(f"β Error during
|
695 |
return 1
|
696 |
-
|
697 |
return 0
|
698 |
|
699 |
if __name__ == "__main__":
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
+
Push Trained Models and Datasets to Hugging Face Hub
|
4 |
+
|
5 |
+
Usage:
|
6 |
+
# Push a trained model
|
7 |
+
python push_to_huggingface.py model /path/to/model my-model-repo
|
8 |
+
|
9 |
+
# Push a dataset
|
10 |
+
python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo
|
11 |
+
|
12 |
+
Authentication:
|
13 |
+
Set HF_TOKEN environment variable or use --token:
|
14 |
+
export HF_TOKEN=your_token_here
|
15 |
"""
|
16 |
|
17 |
import os
|
18 |
import json
|
19 |
import argparse
|
20 |
import logging
|
|
|
21 |
from pathlib import Path
|
22 |
+
from typing import Dict, Any, Optional
|
23 |
from datetime import datetime
|
|
|
|
|
|
|
24 |
|
25 |
# Set timeout for HF operations to prevent hanging
|
26 |
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
|
|
|
28 |
|
29 |
try:
|
30 |
from huggingface_hub import HfApi, create_repo, upload_file
|
|
|
31 |
HF_AVAILABLE = True
|
32 |
except ImportError:
|
33 |
HF_AVAILABLE = False
|
34 |
print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
logger = logging.getLogger(__name__)
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
class HuggingFacePusher:
|
39 |
+
"""Push trained models to Hugging Face Hub"""
|
40 |
|
41 |
def __init__(
|
42 |
self,
|
|
|
44 |
repo_name: str,
|
45 |
token: Optional[str] = None,
|
46 |
private: bool = False,
|
|
|
|
|
|
|
|
|
47 |
author_name: Optional[str] = None,
|
48 |
model_description: Optional[str] = None,
|
|
|
49 |
model_name: Optional[str] = None,
|
50 |
+
dataset_name: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
51 |
):
|
52 |
self.model_path = Path(model_path)
|
53 |
# Original user input (may be just the repo name without username)
|
54 |
self.repo_name = repo_name
|
55 |
+
self.token = token or os.getenv('HF_TOKEN')
|
56 |
self.private = private
|
|
|
|
|
57 |
self.author_name = author_name
|
58 |
self.model_description = model_description
|
59 |
+
|
60 |
+
# Model card generation details
|
61 |
+
self.model_name = model_name
|
|
|
62 |
self.dataset_name = dataset_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
# Initialize HF API
|
65 |
if HF_AVAILABLE:
|
|
|
70 |
# Resolve the full repo id (username/repo) if user only provided repo name
|
71 |
self.repo_id = self._resolve_repo_id(self.repo_name)
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
|
|
|
74 |
|
75 |
def _resolve_repo_id(self, repo_name: str) -> str:
|
76 |
"""Return a fully-qualified repo id in the form username/repo.
|
|
|
468 |
logger.error(f"β Failed to create README: {e}")
|
469 |
return False
|
470 |
|
471 |
+
|
472 |
+
def push_model(self, training_config: Optional[Dict[str, Any]] = None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
results: Optional[Dict[str, Any]] = None) -> bool:
|
474 |
+
"""Complete model push process"""
|
475 |
logger.info(f"π Starting model push to {self.repo_id}")
|
476 |
+
|
|
|
477 |
# Validate model path
|
478 |
if not self.validate_model_path():
|
479 |
return False
|
480 |
+
|
481 |
# Create repository
|
482 |
if not self.create_repository():
|
483 |
return False
|
484 |
+
|
485 |
# Load training config and results if not provided
|
486 |
if training_config is None:
|
487 |
training_config = self._load_training_config()
|
488 |
+
|
489 |
if results is None:
|
490 |
results = self._load_training_results()
|
491 |
+
|
492 |
# Create and upload model card
|
493 |
model_card = self.create_model_card(training_config, results)
|
494 |
model_card_path = Path("temp_model_card.md")
|
495 |
with open(model_card_path, "w") as f:
|
496 |
f.write(model_card)
|
497 |
+
|
498 |
try:
|
499 |
upload_file(
|
500 |
path_or_fileobj=str(model_card_path),
|
|
|
504 |
)
|
505 |
finally:
|
506 |
model_card_path.unlink()
|
507 |
+
|
508 |
# Upload model files
|
509 |
if not self.upload_model_files():
|
510 |
return False
|
511 |
+
|
512 |
# Upload training results
|
513 |
if results:
|
514 |
self.upload_training_results(str(self.model_path))
|
515 |
+
|
516 |
+
# Log success
|
517 |
+
logger.info(f"β
Model successfully pushed to {self.repo_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
518 |
logger.info(f"π Model successfully pushed to: https://huggingface.co/{self.repo_id}")
|
519 |
+
|
520 |
return True
|
521 |
+
|
522 |
+
def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
|
523 |
+
"""Push dataset to Hugging Face Hub"""
|
524 |
+
logger.info(f"π Starting dataset push to {dataset_repo_name}")
|
525 |
+
|
526 |
+
try:
|
527 |
+
from huggingface_hub import create_repo
|
528 |
+
import json
|
529 |
+
|
530 |
+
# Determine full dataset repo name
|
531 |
+
if "/" not in dataset_repo_name:
|
532 |
+
dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}"
|
533 |
+
|
534 |
+
# Create dataset repository
|
535 |
+
try:
|
536 |
+
create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True)
|
537 |
+
logger.info(f"β
Created dataset repository: {dataset_repo_name}")
|
538 |
+
except Exception as e:
|
539 |
+
if "already exists" not in str(e).lower():
|
540 |
+
logger.error(f"β Failed to create dataset repo: {e}")
|
541 |
+
return False
|
542 |
+
logger.info(f"π Dataset repository already exists: {dataset_repo_name}")
|
543 |
+
|
544 |
+
# Read the dataset file
|
545 |
+
dataset_file = Path(dataset_path)
|
546 |
+
if not dataset_file.exists():
|
547 |
+
logger.error(f"β Dataset file not found: {dataset_path}")
|
548 |
+
return False
|
549 |
+
|
550 |
+
# Count lines for metadata
|
551 |
+
with open(dataset_file, 'r', encoding='utf-8') as f:
|
552 |
+
num_examples = sum(1 for _ in f)
|
553 |
+
|
554 |
+
file_size = dataset_file.stat().st_size
|
555 |
+
|
556 |
+
# Upload the dataset file
|
557 |
+
upload_file(
|
558 |
+
path_or_fileobj=str(dataset_file),
|
559 |
+
path_in_repo="data.jsonl",
|
560 |
+
repo_id=dataset_repo_name,
|
561 |
+
repo_type="dataset",
|
562 |
+
token=self.token
|
563 |
+
)
|
564 |
+
logger.info(f"β
Uploaded dataset file: {dataset_file.name}")
|
565 |
+
|
566 |
+
# Create a dataset README
|
567 |
+
readme_content = f"""---
|
568 |
+
dataset_info:
|
569 |
+
features:
|
570 |
+
- name: audio_path
|
571 |
+
dtype: string
|
572 |
+
- name: text
|
573 |
+
dtype: string
|
574 |
+
splits:
|
575 |
+
- name: train
|
576 |
+
num_bytes: {file_size}
|
577 |
+
num_examples: {num_examples}
|
578 |
+
download_size: {file_size}
|
579 |
+
dataset_size: {file_size}
|
580 |
+
tags:
|
581 |
+
- voxtral
|
582 |
+
- asr
|
583 |
+
- fine-tuning
|
584 |
+
- conversational
|
585 |
+
- speech-to-text
|
586 |
+
- audio-to-text
|
587 |
+
- tonic
|
588 |
+
---
|
589 |
+
|
590 |
+
# Voxtral ASR Dataset
|
591 |
+
|
592 |
+
This dataset was created for fine-tuning Voxtral ASR models.
|
593 |
+
|
594 |
+
## Dataset Structure
|
595 |
+
|
596 |
+
- **audio_path**: Path to the audio file
|
597 |
+
- **text**: Transcription of the audio
|
598 |
+
|
599 |
+
## Statistics
|
600 |
+
|
601 |
+
- Number of examples: {num_examples}
|
602 |
+
- File size: {file_size} bytes
|
603 |
+
|
604 |
+
## Usage
|
605 |
+
|
606 |
+
```python
|
607 |
+
from datasets import load_dataset
|
608 |
+
|
609 |
+
dataset = load_dataset("{dataset_repo_name}")
|
610 |
+
```
|
611 |
+
"""
|
612 |
+
|
613 |
+
# Upload README
|
614 |
+
readme_path = dataset_file.parent / "README.md"
|
615 |
+
with open(readme_path, "w") as f:
|
616 |
+
f.write(readme_content)
|
617 |
+
|
618 |
+
upload_file(
|
619 |
+
path_or_fileobj=str(readme_path),
|
620 |
+
path_in_repo="README.md",
|
621 |
+
repo_id=dataset_repo_name,
|
622 |
+
repo_type="dataset",
|
623 |
+
token=self.token
|
624 |
+
)
|
625 |
+
|
626 |
+
readme_path.unlink() # Clean up temp file
|
627 |
+
|
628 |
+
logger.info(f"β
Dataset README uploaded")
|
629 |
+
logger.info(f"π Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
|
630 |
+
|
631 |
+
return True
|
632 |
+
|
633 |
+
except Exception as e:
|
634 |
+
logger.error(f"β Failed to push dataset: {e}")
|
635 |
+
return False
|
636 |
|
637 |
def _load_training_config(self) -> Dict[str, Any]:
|
638 |
"""Load training configuration"""
|
|
|
654 |
"""Parse command line arguments"""
|
655 |
parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')
|
656 |
|
657 |
+
# Subcommands
|
658 |
+
subparsers = parser.add_subparsers(dest='command', help='Available commands')
|
659 |
+
|
660 |
+
# Model push subcommand
|
661 |
+
model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub')
|
662 |
+
model_parser.add_argument('model_path', type=str, help='Path to trained model directory')
|
663 |
+
model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
|
664 |
+
model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
|
665 |
+
model_parser.add_argument('--private', action='store_true', help='Make repository private')
|
666 |
+
model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
|
667 |
+
model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
|
668 |
+
model_parser.add_argument('--model-name', type=str, default=None, help='Base model name')
|
669 |
+
model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
|
670 |
+
|
671 |
+
# Dataset push subcommand
|
672 |
+
dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub')
|
673 |
+
dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file')
|
674 |
+
dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
|
675 |
+
dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
|
676 |
+
dataset_parser.add_argument('--private', action='store_true', help='Make repository private')
|
|
|
677 |
|
678 |
return parser.parse_args()
|
679 |
|
680 |
def main():
|
681 |
"""Main function"""
|
682 |
args = parse_args()
|
683 |
+
|
684 |
# Setup logging
|
685 |
logging.basicConfig(
|
686 |
level=logging.INFO,
|
687 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
688 |
)
|
689 |
+
|
690 |
+
if not args.command:
|
691 |
+
logger.error("β No command specified. Use 'model' or 'dataset' subcommand.")
|
692 |
+
return 1
|
693 |
+
|
694 |
try:
|
695 |
+
if args.command == 'model':
|
696 |
+
logger.info("Starting model push to Hugging Face Hub")
|
697 |
+
|
698 |
+
# Initialize pusher
|
699 |
+
pusher = HuggingFacePusher(
|
700 |
+
model_path=args.model_path,
|
701 |
+
repo_name=args.repo_name,
|
702 |
+
token=args.token,
|
703 |
+
private=args.private,
|
704 |
+
author_name=args.author_name,
|
705 |
+
model_description=args.model_description,
|
706 |
+
model_name=args.model_name,
|
707 |
+
dataset_name=args.dataset_name
|
708 |
+
)
|
709 |
+
|
710 |
+
# Push model
|
711 |
+
success = pusher.push_model()
|
712 |
+
|
713 |
+
if success:
|
714 |
+
logger.info("β
Model push completed successfully!")
|
715 |
+
logger.info(f"π View your model at: https://huggingface.co/{args.repo_name}")
|
716 |
+
else:
|
717 |
+
logger.error("β Model push failed!")
|
718 |
+
return 1
|
719 |
+
|
720 |
+
elif args.command == 'dataset':
|
721 |
+
logger.info("Starting dataset push to Hugging Face Hub")
|
722 |
+
|
723 |
+
# Initialize pusher for dataset
|
724 |
+
pusher = HuggingFacePusher(
|
725 |
+
model_path="", # Not needed for dataset push
|
726 |
+
repo_name=args.repo_name,
|
727 |
+
token=args.token,
|
728 |
+
private=args.private
|
729 |
+
)
|
730 |
+
|
731 |
+
# Push dataset
|
732 |
+
success = pusher.push_dataset(args.dataset_path, args.repo_name)
|
733 |
+
|
734 |
+
if success:
|
735 |
+
logger.info("β
Dataset push completed successfully!")
|
736 |
+
logger.info(f"π View your dataset at: https://huggingface.co/datasets/{args.repo_name}")
|
737 |
+
else:
|
738 |
+
logger.error("β Dataset push failed!")
|
739 |
+
return 1
|
740 |
+
|
741 |
except Exception as e:
|
742 |
+
logger.error(f"β Error during push: {e}")
|
743 |
return 1
|
744 |
+
|
745 |
return 0
|
746 |
|
747 |
if __name__ == "__main__":
|
scripts/train.py
CHANGED
@@ -1,8 +1,31 @@
|
|
1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import argparse
|
4 |
import json
|
5 |
from pathlib import Path
|
|
|
|
|
6 |
import torch
|
7 |
from datasets import load_dataset, Audio, Dataset
|
8 |
from transformers import (
|
@@ -11,6 +34,85 @@ from transformers import (
|
|
11 |
Trainer,
|
12 |
TrainingArguments,
|
13 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
|
16 |
class VoxtralDataCollator:
|
@@ -161,6 +263,12 @@ def main():
|
|
161 |
parser.add_argument("--epochs", type=float, default=3)
|
162 |
parser.add_argument("--logging-steps", type=int, default=10)
|
163 |
parser.add_argument("--save-steps", type=int, default=50)
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
args = parser.parse_args()
|
165 |
|
166 |
model_checkpoint = args.model_checkpoint
|
@@ -169,6 +277,48 @@ def main():
|
|
169 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
170 |
print(f"Using device: {torch_device}")
|
171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
print("Loading processor and model...")
|
173 |
processor = VoxtralProcessor.from_pretrained(model_checkpoint)
|
174 |
model = VoxtralForConditionalGeneration.from_pretrained(
|
@@ -200,7 +350,7 @@ def main():
|
|
200 |
save_steps=args.save_steps,
|
201 |
eval_strategy="steps" if eval_dataset else "no",
|
202 |
save_strategy="steps",
|
203 |
-
report_to="
|
204 |
remove_unused_columns=False,
|
205 |
dataloader_num_workers=1,
|
206 |
)
|
@@ -223,6 +373,44 @@ def main():
|
|
223 |
if eval_dataset:
|
224 |
results = trainer.evaluate()
|
225 |
print(f"Final evaluation results: {results}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
|
227 |
print("Training completed successfully!")
|
228 |
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Voxtral ASR Full Fine-tuning Script with Trackio Integration
|
4 |
+
|
5 |
+
This script fine-tunes Voxtral models for ASR tasks with automatic experiment tracking
|
6 |
+
via Trackio and Hugging Face Spaces.
|
7 |
+
|
8 |
+
Features:
|
9 |
+
- Automatic username detection from HF_TOKEN environment variable
|
10 |
+
- Auto-generated space names with timestamps
|
11 |
+
- Local-only mode when no HF_TOKEN is set
|
12 |
+
- Comprehensive experiment logging
|
13 |
+
- Optional dataset pushing to Hugging Face Hub
|
14 |
+
|
15 |
+
Authentication:
|
16 |
+
Set HF_TOKEN environment variable to enable automatic space creation:
|
17 |
+
Linux/Mac: export HF_TOKEN=your_token_here
|
18 |
+
Windows: set HF_TOKEN=your_token_here
|
19 |
+
Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
|
20 |
+
|
21 |
+
Get your token from: https://huggingface.co/settings/tokens
|
22 |
+
"""
|
23 |
|
24 |
import argparse
|
25 |
import json
|
26 |
from pathlib import Path
|
27 |
+
from datetime import datetime
|
28 |
+
from typing import Tuple, Optional
|
29 |
import torch
|
30 |
from datasets import load_dataset, Audio, Dataset
|
31 |
from transformers import (
|
|
|
34 |
Trainer,
|
35 |
TrainingArguments,
|
36 |
)
|
37 |
+
from huggingface_hub import HfApi
|
38 |
+
import trackio
|
39 |
+
|
40 |
+
|
41 |
+
def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
|
42 |
+
"""
|
43 |
+
Validate a Hugging Face token and return the username.
|
44 |
+
|
45 |
+
Args:
|
46 |
+
token (str): The Hugging Face token to validate
|
47 |
+
|
48 |
+
Returns:
|
49 |
+
Tuple[bool, Optional[str], Optional[str]]:
|
50 |
+
- success: True if token is valid, False otherwise
|
51 |
+
- username: The username associated with the token (if valid)
|
52 |
+
- error_message: Error message if validation failed
|
53 |
+
"""
|
54 |
+
try:
|
55 |
+
# Create API client with token directly
|
56 |
+
api = HfApi(token=token)
|
57 |
+
|
58 |
+
# Try to get user info - this will fail if token is invalid
|
59 |
+
user_info = api.whoami()
|
60 |
+
|
61 |
+
# Extract username from user info
|
62 |
+
username = user_info.get("name", user_info.get("username"))
|
63 |
+
|
64 |
+
if not username:
|
65 |
+
return False, None, "Could not retrieve username from token"
|
66 |
+
|
67 |
+
return True, username, None
|
68 |
+
|
69 |
+
except Exception as e:
|
70 |
+
error_msg = str(e)
|
71 |
+
if "401" in error_msg or "unauthorized" in error_msg.lower():
|
72 |
+
return False, None, "Invalid token - unauthorized access"
|
73 |
+
elif "403" in error_msg:
|
74 |
+
return False, None, "Token lacks required permissions"
|
75 |
+
elif "network" in error_msg.lower() or "connection" in error_msg.lower():
|
76 |
+
return False, None, f"Network error: {error_msg}"
|
77 |
+
else:
|
78 |
+
return False, None, f"Validation error: {error_msg}"
|
79 |
+
|
80 |
+
|
81 |
+
def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> str:
|
82 |
+
"""
|
83 |
+
Generate a default space name with username and timestamp.
|
84 |
+
|
85 |
+
Args:
|
86 |
+
project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
|
87 |
+
|
88 |
+
Returns:
|
89 |
+
str: Default space name in format "username/project-type-timestamp"
|
90 |
+
"""
|
91 |
+
try:
|
92 |
+
# Get token from environment variables only
|
93 |
+
import os
|
94 |
+
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
95 |
+
|
96 |
+
if not token:
|
97 |
+
print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
|
98 |
+
print("Set HF_TOKEN environment variable to enable automatic space creation.")
|
99 |
+
print("Example: export HF_TOKEN=your_token_here")
|
100 |
+
print("Falling back to local-only mode.")
|
101 |
+
return None
|
102 |
+
|
103 |
+
# Validate token and get username
|
104 |
+
success, username, error = validate_hf_token(token)
|
105 |
+
if success and username:
|
106 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
107 |
+
return f"{username}/{project_type}-{timestamp}"
|
108 |
+
else:
|
109 |
+
print(f"Warning: Token validation failed: {error}")
|
110 |
+
print("Falling back to local-only mode.")
|
111 |
+
return None
|
112 |
+
|
113 |
+
except Exception as e:
|
114 |
+
print(f"Warning: Failed to generate default space name: {e}")
|
115 |
+
return None
|
116 |
|
117 |
|
118 |
class VoxtralDataCollator:
|
|
|
263 |
parser.add_argument("--epochs", type=float, default=3)
|
264 |
parser.add_argument("--logging-steps", type=int, default=10)
|
265 |
parser.add_argument("--save-steps", type=int, default=50)
|
266 |
+
parser.add_argument("--trackio-space", type=str, default=None,
|
267 |
+
help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
|
268 |
+
parser.add_argument("--push-dataset", action="store_true",
|
269 |
+
help="Push the training dataset to Hugging Face Hub after training")
|
270 |
+
parser.add_argument("--dataset-repo", type=str, default=None,
|
271 |
+
help="Dataset repository name for pushing dataset (format: username/dataset-name)")
|
272 |
args = parser.parse_args()
|
273 |
|
274 |
model_checkpoint = args.model_checkpoint
|
|
|
277 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
278 |
print(f"Using device: {torch_device}")
|
279 |
|
280 |
+
# Determine trackio space
|
281 |
+
trackio_space = args.trackio_space
|
282 |
+
if not trackio_space:
|
283 |
+
trackio_space = get_default_space_name("voxtral-asr-finetuning")
|
284 |
+
|
285 |
+
# Initialize trackio for experiment tracking
|
286 |
+
if trackio_space:
|
287 |
+
print(f"Initializing trackio with space: {trackio_space}")
|
288 |
+
trackio.init(
|
289 |
+
project="voxtral-finetuning",
|
290 |
+
config={
|
291 |
+
"model_checkpoint": model_checkpoint,
|
292 |
+
"output_dir": output_dir,
|
293 |
+
"batch_size": args.batch_size,
|
294 |
+
"learning_rate": args.learning_rate,
|
295 |
+
"epochs": args.epochs,
|
296 |
+
"train_count": args.train_count,
|
297 |
+
"eval_count": args.eval_count,
|
298 |
+
"dataset_jsonl": args.dataset_jsonl,
|
299 |
+
"dataset_name": args.dataset_name,
|
300 |
+
"dataset_config": args.dataset_config,
|
301 |
+
},
|
302 |
+
space_id=trackio_space
|
303 |
+
)
|
304 |
+
else:
|
305 |
+
print("Initializing trackio in local-only mode")
|
306 |
+
trackio.init(
|
307 |
+
project="voxtral-finetuning",
|
308 |
+
config={
|
309 |
+
"model_checkpoint": model_checkpoint,
|
310 |
+
"output_dir": output_dir,
|
311 |
+
"batch_size": args.batch_size,
|
312 |
+
"learning_rate": args.learning_rate,
|
313 |
+
"epochs": args.epochs,
|
314 |
+
"train_count": args.train_count,
|
315 |
+
"eval_count": args.eval_count,
|
316 |
+
"dataset_jsonl": args.dataset_jsonl,
|
317 |
+
"dataset_name": args.dataset_name,
|
318 |
+
"dataset_config": args.dataset_config,
|
319 |
+
}
|
320 |
+
)
|
321 |
+
|
322 |
print("Loading processor and model...")
|
323 |
processor = VoxtralProcessor.from_pretrained(model_checkpoint)
|
324 |
model = VoxtralForConditionalGeneration.from_pretrained(
|
|
|
350 |
save_steps=args.save_steps,
|
351 |
eval_strategy="steps" if eval_dataset else "no",
|
352 |
save_strategy="steps",
|
353 |
+
report_to=["trackio"],
|
354 |
remove_unused_columns=False,
|
355 |
dataloader_num_workers=1,
|
356 |
)
|
|
|
373 |
if eval_dataset:
|
374 |
results = trainer.evaluate()
|
375 |
print(f"Final evaluation results: {results}")
|
376 |
+
# Log final evaluation results
|
377 |
+
trackio.log(results)
|
378 |
+
|
379 |
+
# Push dataset to Hub if requested
|
380 |
+
if args.push_dataset and args.dataset_jsonl:
|
381 |
+
print("Pushing dataset to Hugging Face Hub...")
|
382 |
+
try:
|
383 |
+
from pathlib import Path
|
384 |
+
import subprocess
|
385 |
+
|
386 |
+
dataset_repo = args.dataset_repo
|
387 |
+
if not dataset_repo:
|
388 |
+
# Auto-generate dataset repo name
|
389 |
+
if trackio_space:
|
390 |
+
username = trackio_space.split('/')[0]
|
391 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
392 |
+
dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
|
393 |
+
else:
|
394 |
+
print("Warning: Cannot auto-generate dataset repo name without HF token")
|
395 |
+
dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
|
396 |
+
|
397 |
+
# Call the push script
|
398 |
+
push_cmd = [
|
399 |
+
"python", str(Path(__file__).parent / "push_to_huggingface.py"),
|
400 |
+
"dataset", args.dataset_jsonl, dataset_repo
|
401 |
+
]
|
402 |
+
|
403 |
+
result = subprocess.run(push_cmd, capture_output=True, text=True)
|
404 |
+
if result.returncode == 0:
|
405 |
+
print(f"β
Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
|
406 |
+
else:
|
407 |
+
print(f"β Failed to push dataset: {result.stderr}")
|
408 |
+
|
409 |
+
except Exception as e:
|
410 |
+
print(f"β Error pushing dataset: {e}")
|
411 |
+
|
412 |
+
# Finish trackio logging
|
413 |
+
trackio.finish()
|
414 |
|
415 |
print("Training completed successfully!")
|
416 |
|
scripts/train_lora.py
CHANGED
@@ -1,8 +1,32 @@
|
|
1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import argparse
|
4 |
import json
|
5 |
from pathlib import Path
|
|
|
|
|
6 |
import torch
|
7 |
from datasets import load_dataset, Audio, Dataset
|
8 |
from transformers import (
|
@@ -12,6 +36,85 @@ from transformers import (
|
|
12 |
TrainingArguments,
|
13 |
)
|
14 |
from peft import LoraConfig, get_peft_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
class VoxtralDataCollator:
|
@@ -163,6 +266,12 @@ def main():
|
|
163 |
parser.add_argument("--lora-alpha", type=int, default=32)
|
164 |
parser.add_argument("--lora-dropout", type=float, default=0.0)
|
165 |
parser.add_argument("--freeze-audio-tower", action="store_true", help="Freeze audio encoder parameters")
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
args = parser.parse_args()
|
167 |
|
168 |
model_checkpoint = args.model_checkpoint
|
@@ -171,6 +280,56 @@ def main():
|
|
171 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
172 |
print(f"Using device: {torch_device}")
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
print("Loading processor and model...")
|
175 |
processor = VoxtralProcessor.from_pretrained(model_checkpoint)
|
176 |
lora_cfg = LoraConfig(
|
@@ -210,12 +369,12 @@ def main():
|
|
210 |
learning_rate=args.learning_rate,
|
211 |
num_train_epochs=args.epochs,
|
212 |
bf16=True,
|
213 |
-
logging_steps=args.
|
214 |
eval_steps=args.save_steps if eval_dataset else None,
|
215 |
save_steps=args.save_steps,
|
216 |
eval_strategy="steps" if eval_dataset else "no",
|
217 |
save_strategy="steps",
|
218 |
-
report_to="
|
219 |
remove_unused_columns=False,
|
220 |
dataloader_num_workers=1,
|
221 |
)
|
@@ -238,6 +397,44 @@ def main():
|
|
238 |
if eval_dataset:
|
239 |
results = trainer.evaluate()
|
240 |
print(f"Final evaluation results: {results}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
print("Training completed successfully!")
|
243 |
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Voxtral ASR LoRA Fine-tuning Script with Trackio Integration
|
4 |
+
|
5 |
+
This script fine-tunes Voxtral models using LoRA for ASR tasks with automatic experiment tracking
|
6 |
+
via Trackio and Hugging Face Spaces.
|
7 |
+
|
8 |
+
Features:
|
9 |
+
- Automatic username detection from HF_TOKEN environment variable
|
10 |
+
- Auto-generated space names with timestamps
|
11 |
+
- Local-only mode when no HF_TOKEN is set
|
12 |
+
- Comprehensive experiment logging
|
13 |
+
- LoRA-specific hyperparameters tracking
|
14 |
+
- Optional dataset pushing to Hugging Face Hub
|
15 |
+
|
16 |
+
Authentication:
|
17 |
+
Set HF_TOKEN environment variable to enable automatic space creation:
|
18 |
+
Linux/Mac: export HF_TOKEN=your_token_here
|
19 |
+
Windows: set HF_TOKEN=your_token_here
|
20 |
+
Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
|
21 |
+
|
22 |
+
Get your token from: https://huggingface.co/settings/tokens
|
23 |
+
"""
|
24 |
|
25 |
import argparse
|
26 |
import json
|
27 |
from pathlib import Path
|
28 |
+
from datetime import datetime
|
29 |
+
from typing import Tuple, Optional
|
30 |
import torch
|
31 |
from datasets import load_dataset, Audio, Dataset
|
32 |
from transformers import (
|
|
|
36 |
TrainingArguments,
|
37 |
)
|
38 |
from peft import LoraConfig, get_peft_model
|
39 |
+
from huggingface_hub import HfApi
|
40 |
+
import trackio
|
41 |
+
|
42 |
+
|
43 |
+
def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
|
44 |
+
"""
|
45 |
+
Validate a Hugging Face token and return the username.
|
46 |
+
|
47 |
+
Args:
|
48 |
+
token (str): The Hugging Face token to validate
|
49 |
+
|
50 |
+
Returns:
|
51 |
+
Tuple[bool, Optional[str], Optional[str]]:
|
52 |
+
- success: True if token is valid, False otherwise
|
53 |
+
- username: The username associated with the token (if valid)
|
54 |
+
- error_message: Error message if validation failed
|
55 |
+
"""
|
56 |
+
try:
|
57 |
+
# Create API client with token directly
|
58 |
+
api = HfApi(token=token)
|
59 |
+
|
60 |
+
# Try to get user info - this will fail if token is invalid
|
61 |
+
user_info = api.whoami()
|
62 |
+
|
63 |
+
# Extract username from user info
|
64 |
+
username = user_info.get("name", user_info.get("username"))
|
65 |
+
|
66 |
+
if not username:
|
67 |
+
return False, None, "Could not retrieve username from token"
|
68 |
+
|
69 |
+
return True, username, None
|
70 |
+
|
71 |
+
except Exception as e:
|
72 |
+
error_msg = str(e)
|
73 |
+
if "401" in error_msg or "unauthorized" in error_msg.lower():
|
74 |
+
return False, None, "Invalid token - unauthorized access"
|
75 |
+
elif "403" in error_msg:
|
76 |
+
return False, None, "Token lacks required permissions"
|
77 |
+
elif "network" in error_msg.lower() or "connection" in error_msg.lower():
|
78 |
+
return False, None, f"Network error: {error_msg}"
|
79 |
+
else:
|
80 |
+
return False, None, f"Validation error: {error_msg}"
|
81 |
+
|
82 |
+
|
83 |
+
def get_default_space_name(project_type: str = "voxtral-lora-finetuning") -> str:
|
84 |
+
"""
|
85 |
+
Generate a default space name with username and timestamp.
|
86 |
+
|
87 |
+
Args:
|
88 |
+
project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
|
89 |
+
|
90 |
+
Returns:
|
91 |
+
str: Default space name in format "username/project-type-timestamp"
|
92 |
+
"""
|
93 |
+
try:
|
94 |
+
# Get token from environment variables only
|
95 |
+
import os
|
96 |
+
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
97 |
+
|
98 |
+
if not token:
|
99 |
+
print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
|
100 |
+
print("Set HF_TOKEN environment variable to enable automatic space creation.")
|
101 |
+
print("Example: export HF_TOKEN=your_token_here")
|
102 |
+
print("Falling back to local-only mode.")
|
103 |
+
return None
|
104 |
+
|
105 |
+
# Validate token and get username
|
106 |
+
success, username, error = validate_hf_token(token)
|
107 |
+
if success and username:
|
108 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
109 |
+
return f"{username}/{project_type}-{timestamp}"
|
110 |
+
else:
|
111 |
+
print(f"Warning: Token validation failed: {error}")
|
112 |
+
print("Falling back to local-only mode.")
|
113 |
+
return None
|
114 |
+
|
115 |
+
except Exception as e:
|
116 |
+
print(f"Warning: Failed to generate default space name: {e}")
|
117 |
+
return None
|
118 |
|
119 |
|
120 |
class VoxtralDataCollator:
|
|
|
266 |
parser.add_argument("--lora-alpha", type=int, default=32)
|
267 |
parser.add_argument("--lora-dropout", type=float, default=0.0)
|
268 |
parser.add_argument("--freeze-audio-tower", action="store_true", help="Freeze audio encoder parameters")
|
269 |
+
parser.add_argument("--trackio-space", type=str, default=None,
|
270 |
+
help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
|
271 |
+
parser.add_argument("--push-dataset", action="store_true",
|
272 |
+
help="Push the training dataset to Hugging Face Hub after training")
|
273 |
+
parser.add_argument("--dataset-repo", type=str, default=None,
|
274 |
+
help="Dataset repository name for pushing dataset (format: username/dataset-name)")
|
275 |
args = parser.parse_args()
|
276 |
|
277 |
model_checkpoint = args.model_checkpoint
|
|
|
280 |
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
281 |
print(f"Using device: {torch_device}")
|
282 |
|
283 |
+
# Determine trackio space
|
284 |
+
trackio_space = args.trackio_space
|
285 |
+
if not trackio_space:
|
286 |
+
trackio_space = get_default_space_name("voxtral-lora-finetuning")
|
287 |
+
|
288 |
+
# Initialize trackio for experiment tracking
|
289 |
+
if trackio_space:
|
290 |
+
print(f"Initializing trackio with space: {trackio_space}")
|
291 |
+
trackio.init(
|
292 |
+
project="voxtral-lora-finetuning",
|
293 |
+
config={
|
294 |
+
"model_checkpoint": model_checkpoint,
|
295 |
+
"output_dir": output_dir,
|
296 |
+
"batch_size": args.batch_size,
|
297 |
+
"learning_rate": args.learning_rate,
|
298 |
+
"epochs": args.epochs,
|
299 |
+
"train_count": args.train_count,
|
300 |
+
"eval_count": args.eval_count,
|
301 |
+
"dataset_jsonl": args.dataset_jsonl,
|
302 |
+
"dataset_name": args.dataset_name,
|
303 |
+
"dataset_config": args.dataset_config,
|
304 |
+
"lora_r": args.lora_r,
|
305 |
+
"lora_alpha": args.lora_alpha,
|
306 |
+
"lora_dropout": args.lora_dropout,
|
307 |
+
"freeze_audio_tower": args.freeze_audio_tower,
|
308 |
+
},
|
309 |
+
space_id=trackio_space
|
310 |
+
)
|
311 |
+
else:
|
312 |
+
print("Initializing trackio in local-only mode")
|
313 |
+
trackio.init(
|
314 |
+
project="voxtral-lora-finetuning",
|
315 |
+
config={
|
316 |
+
"model_checkpoint": model_checkpoint,
|
317 |
+
"output_dir": output_dir,
|
318 |
+
"batch_size": args.batch_size,
|
319 |
+
"learning_rate": args.learning_rate,
|
320 |
+
"epochs": args.epochs,
|
321 |
+
"train_count": args.train_count,
|
322 |
+
"eval_count": args.eval_count,
|
323 |
+
"dataset_jsonl": args.dataset_jsonl,
|
324 |
+
"dataset_name": args.dataset_name,
|
325 |
+
"dataset_config": args.dataset_config,
|
326 |
+
"lora_r": args.lora_r,
|
327 |
+
"lora_alpha": args.lora_alpha,
|
328 |
+
"lora_dropout": args.lora_dropout,
|
329 |
+
"freeze_audio_tower": args.freeze_audio_tower,
|
330 |
+
}
|
331 |
+
)
|
332 |
+
|
333 |
print("Loading processor and model...")
|
334 |
processor = VoxtralProcessor.from_pretrained(model_checkpoint)
|
335 |
lora_cfg = LoraConfig(
|
|
|
369 |
learning_rate=args.learning_rate,
|
370 |
num_train_epochs=args.epochs,
|
371 |
bf16=True,
|
372 |
+
logging_steps=args.logging_steps,
|
373 |
eval_steps=args.save_steps if eval_dataset else None,
|
374 |
save_steps=args.save_steps,
|
375 |
eval_strategy="steps" if eval_dataset else "no",
|
376 |
save_strategy="steps",
|
377 |
+
report_to=["trackio"],
|
378 |
remove_unused_columns=False,
|
379 |
dataloader_num_workers=1,
|
380 |
)
|
|
|
397 |
if eval_dataset:
|
398 |
results = trainer.evaluate()
|
399 |
print(f"Final evaluation results: {results}")
|
400 |
+
# Log final evaluation results
|
401 |
+
trackio.log(results)
|
402 |
+
|
403 |
+
# Push dataset to Hub if requested
|
404 |
+
if args.push_dataset and args.dataset_jsonl:
|
405 |
+
print("Pushing dataset to Hugging Face Hub...")
|
406 |
+
try:
|
407 |
+
from pathlib import Path
|
408 |
+
import subprocess
|
409 |
+
|
410 |
+
dataset_repo = args.dataset_repo
|
411 |
+
if not dataset_repo:
|
412 |
+
# Auto-generate dataset repo name
|
413 |
+
if trackio_space:
|
414 |
+
username = trackio_space.split('/')[0]
|
415 |
+
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
416 |
+
dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
|
417 |
+
else:
|
418 |
+
print("Warning: Cannot auto-generate dataset repo name without HF token")
|
419 |
+
dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
|
420 |
+
|
421 |
+
# Call the push script
|
422 |
+
push_cmd = [
|
423 |
+
"python", str(Path(__file__).parent / "push_to_huggingface.py"),
|
424 |
+
"dataset", args.dataset_jsonl, dataset_repo
|
425 |
+
]
|
426 |
+
|
427 |
+
result = subprocess.run(push_cmd, capture_output=True, text=True)
|
428 |
+
if result.returncode == 0:
|
429 |
+
print(f"β
Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
|
430 |
+
else:
|
431 |
+
print(f"β Failed to push dataset: {result.stderr}")
|
432 |
+
|
433 |
+
except Exception as e:
|
434 |
+
print(f"β Error pushing dataset: {e}")
|
435 |
+
|
436 |
+
# Finish trackio logging
|
437 |
+
trackio.finish()
|
438 |
|
439 |
print("Training completed successfully!")
|
440 |
|
templates/datasets/readme.md
DELETED
@@ -1,171 +0,0 @@
|
|
1 |
-
---
|
2 |
-
dataset_info:
|
3 |
-
features:
|
4 |
-
- name: experiment_id
|
5 |
-
dtype: string
|
6 |
-
- name: name
|
7 |
-
dtype: string
|
8 |
-
- name: description
|
9 |
-
dtype: string
|
10 |
-
- name: created_at
|
11 |
-
dtype: string
|
12 |
-
- name: status
|
13 |
-
dtype: string
|
14 |
-
- name: metrics
|
15 |
-
dtype: string
|
16 |
-
- name: parameters
|
17 |
-
dtype: string
|
18 |
-
- name: artifacts
|
19 |
-
dtype: string
|
20 |
-
- name: logs
|
21 |
-
dtype: string
|
22 |
-
- name: last_updated
|
23 |
-
dtype: string
|
24 |
-
splits:
|
25 |
-
- name: train
|
26 |
-
num_bytes: 4945
|
27 |
-
num_examples: 2
|
28 |
-
download_size: 15529
|
29 |
-
dataset_size: 4945
|
30 |
-
configs:
|
31 |
-
- config_name: default
|
32 |
-
data_files:
|
33 |
-
- split: train
|
34 |
-
path: data/train-*
|
35 |
-
tags:
|
36 |
-
- track tonic
|
37 |
-
- tonic
|
38 |
-
- experiment tracking
|
39 |
-
- smollm3
|
40 |
-
- fine-tuning
|
41 |
-
- legml
|
42 |
-
- hermes
|
43 |
-
---
|
44 |
-
|
45 |
-
# Trackio Experiments Dataset
|
46 |
-
|
47 |
-
This dataset stores experiment tracking data for ML training runs, particularly focused on SmolLM3 fine-tuning experiments with comprehensive metrics tracking.
|
48 |
-
|
49 |
-
## Dataset Structure
|
50 |
-
|
51 |
-
The dataset contains the following columns:
|
52 |
-
|
53 |
-
- **experiment_id**: Unique identifier for each experiment
|
54 |
-
- **name**: Human-readable name for the experiment
|
55 |
-
- **description**: Detailed description of the experiment
|
56 |
-
- **created_at**: Timestamp when the experiment was created
|
57 |
-
- **status**: Current status (running, completed, failed, paused)
|
58 |
-
- **metrics**: JSON string containing training metrics over time
|
59 |
-
- **parameters**: JSON string containing experiment configuration
|
60 |
-
- **artifacts**: JSON string containing experiment artifacts
|
61 |
-
- **logs**: JSON string containing experiment logs
|
62 |
-
- **last_updated**: Timestamp of last update
|
63 |
-
|
64 |
-
## Metrics Structure
|
65 |
-
|
66 |
-
The metrics field contains JSON arrays with the following structure:
|
67 |
-
|
68 |
-
```json
|
69 |
-
[
|
70 |
-
{
|
71 |
-
"timestamp": "2025-07-20T11:20:01.780908",
|
72 |
-
"step": 25,
|
73 |
-
"metrics": {
|
74 |
-
"loss": 1.1659,
|
75 |
-
"accuracy": 0.759,
|
76 |
-
"learning_rate": 7e-08,
|
77 |
-
"grad_norm": 10.3125,
|
78 |
-
"epoch": 0.004851130919895701,
|
79 |
-
|
80 |
-
// Advanced Training Metrics
|
81 |
-
"total_tokens": 1642080.0,
|
82 |
-
"truncated_tokens": 128,
|
83 |
-
"padding_tokens": 256,
|
84 |
-
"throughput": 3284160.0,
|
85 |
-
"step_time": 0.5,
|
86 |
-
"batch_size": 8,
|
87 |
-
"seq_len": 2048,
|
88 |
-
"token_acc": 0.759,
|
89 |
-
|
90 |
-
// Custom Losses
|
91 |
-
"train/gate_ortho": 0.0234,
|
92 |
-
"train/center": 0.0156,
|
93 |
-
|
94 |
-
// System Metrics
|
95 |
-
"gpu_memory_allocated": 17.202261447906494,
|
96 |
-
"gpu_memory_reserved": 75.474609375,
|
97 |
-
"gpu_utilization": 85.2,
|
98 |
-
"cpu_percent": 2.7,
|
99 |
-
"memory_percent": 10.1
|
100 |
-
}
|
101 |
-
}
|
102 |
-
]
|
103 |
-
```
|
104 |
-
|
105 |
-
## Supported Metrics
|
106 |
-
|
107 |
-
### Core Training Metrics
|
108 |
-
- **loss**: Training loss value
|
109 |
-
- **accuracy**: Model accuracy
|
110 |
-
- **learning_rate**: Current learning rate
|
111 |
-
- **grad_norm**: Gradient norm
|
112 |
-
- **epoch**: Current epoch progress
|
113 |
-
|
114 |
-
### Advanced Token Metrics
|
115 |
-
- **total_tokens**: Total tokens processed in the batch
|
116 |
-
- **truncated_tokens**: Number of tokens truncated during processing
|
117 |
-
- **padding_tokens**: Number of padding tokens added
|
118 |
-
- **throughput**: Tokens processed per second
|
119 |
-
- **step_time**: Time taken for the current training step
|
120 |
-
- **batch_size**: Current batch size
|
121 |
-
- **seq_len**: Sequence length
|
122 |
-
- **token_acc**: Token-level accuracy
|
123 |
-
|
124 |
-
### Custom Losses (SmolLM3-specific)
|
125 |
-
- **train/gate_ortho**: Gate orthogonality loss
|
126 |
-
- **train/center**: Center loss component
|
127 |
-
|
128 |
-
### System Performance Metrics
|
129 |
-
- **gpu_memory_allocated**: GPU memory currently allocated (GB)
|
130 |
-
- **gpu_memory_reserved**: GPU memory reserved (GB)
|
131 |
-
- **gpu_utilization**: GPU utilization percentage
|
132 |
-
- **cpu_percent**: CPU usage percentage
|
133 |
-
- **memory_percent**: System memory usage percentage
|
134 |
-
|
135 |
-
## Usage
|
136 |
-
|
137 |
-
This dataset is automatically used by the Trackio monitoring system to store and retrieve experiment data. It provides persistent storage for experiment tracking across different training runs.
|
138 |
-
|
139 |
-
## Integration
|
140 |
-
|
141 |
-
The dataset is used by:
|
142 |
-
- Trackio Spaces for experiment visualization
|
143 |
-
- Training scripts for logging metrics and parameters
|
144 |
-
- Monitoring systems for experiment tracking
|
145 |
-
- SmolLM3 fine-tuning pipeline for comprehensive metrics capture
|
146 |
-
|
147 |
-
## Privacy
|
148 |
-
|
149 |
-
This dataset is private by default to ensure experiment data security. Only users with appropriate permissions can access the data.
|
150 |
-
|
151 |
-
## Examples
|
152 |
-
|
153 |
-
### Sample Experiment Entry
|
154 |
-
```json
|
155 |
-
{
|
156 |
-
"experiment_id": "exp_20250720_130853",
|
157 |
-
"name": "smollm3_finetune",
|
158 |
-
"description": "SmolLM3 fine-tuning experiment with comprehensive metrics",
|
159 |
-
"created_at": "2025-07-20T11:20:01.780908",
|
160 |
-
"status": "running",
|
161 |
-
"metrics": "[{\"timestamp\": \"2025-07-20T11:20:01.780908\", \"step\": 25, \"metrics\": {\"loss\": 1.1659, \"accuracy\": 0.759, \"total_tokens\": 1642080.0, \"throughput\": 3284160.0, \"train/gate_ortho\": 0.0234, \"train/center\": 0.0156}}]",
|
162 |
-
"parameters": "{\"model_name\": \"HuggingFaceTB/SmolLM3-3B\", \"batch_size\": 8, \"learning_rate\": 3.5e-06, \"max_seq_length\": 12288}",
|
163 |
-
"artifacts": "[]",
|
164 |
-
"logs": "[]",
|
165 |
-
"last_updated": "2025-07-20T11:20:01.780908"
|
166 |
-
}
|
167 |
-
```
|
168 |
-
|
169 |
-
## License
|
170 |
-
|
171 |
-
This dataset is part of the Trackio experiment tracking system and follows the same license as the main project.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_hf_setup.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test Hugging Face Setup for Trackio Integration
|
4 |
+
|
5 |
+
This script helps verify your Hugging Face token setup and test space name generation.
|
6 |
+
Run this before using the training scripts to ensure everything is configured correctly.
|
7 |
+
|
8 |
+
Authentication:
|
9 |
+
This script only checks for HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variables.
|
10 |
+
It does NOT use huggingface-cli login state.
|
11 |
+
|
12 |
+
Setup:
|
13 |
+
Linux/Mac: export HF_TOKEN=your_token_here
|
14 |
+
Windows: set HF_TOKEN=your_token_here
|
15 |
+
Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
|
16 |
+
|
17 |
+
Get your token from: https://huggingface.co/settings/tokens
|
18 |
+
"""
|
19 |
+
|
20 |
+
import os
|
21 |
+
from datetime import datetime
|
22 |
+
from typing import Tuple, Optional
|
23 |
+
from huggingface_hub import HfApi
|
24 |
+
|
25 |
+
|
26 |
+
def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
|
27 |
+
"""
|
28 |
+
Validate a Hugging Face token and return the username.
|
29 |
+
|
30 |
+
Args:
|
31 |
+
token (str): The Hugging Face token to validate
|
32 |
+
|
33 |
+
Returns:
|
34 |
+
Tuple[bool, Optional[str], Optional[str]]:
|
35 |
+
- success: True if token is valid, False otherwise
|
36 |
+
- username: The username associated with the token (if valid)
|
37 |
+
- error_message: Error message if validation failed
|
38 |
+
"""
|
39 |
+
try:
|
40 |
+
# Create API client with token directly
|
41 |
+
api = HfApi(token=token)
|
42 |
+
|
43 |
+
# Try to get user info - this will fail if token is invalid
|
44 |
+
user_info = api.whoami()
|
45 |
+
|
46 |
+
# Extract username from user info
|
47 |
+
username = user_info.get("name", user_info.get("username"))
|
48 |
+
|
49 |
+
if not username:
|
50 |
+
return False, None, "Could not retrieve username from token"
|
51 |
+
|
52 |
+
return True, username, None
|
53 |
+
|
54 |
+
except Exception as e:
|
55 |
+
error_msg = str(e)
|
56 |
+
if "401" in error_msg or "unauthorized" in error_msg.lower():
|
57 |
+
return False, None, "Invalid token - unauthorized access"
|
58 |
+
elif "403" in error_msg:
|
59 |
+
return False, None, "Token lacks required permissions"
|
60 |
+
elif "network" in error_msg.lower() or "connection" in error_msg.lower():
|
61 |
+
return False, None, f"Network error: {error_msg}"
|
62 |
+
else:
|
63 |
+
return False, None, f"Validation error: {error_msg}"
|
64 |
+
|
65 |
+
|
66 |
+
def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> Optional[str]:
    """
    Generate a default space name with username and timestamp.

    Args:
        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")

    Returns:
        Optional[str]: Default space name in format "username/project-type-YYYYMMDD-HHMMSS",
        or None when no token is configured, the token is invalid, or an
        unexpected error occurs.
    """
    try:
        # Token is read from environment variables only — never prompted.
        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")

        if not token:
            return None

        # Validate token and resolve the owning username.
        success, username, error = validate_hf_token(token)
        if not (success and username):
            return None

        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        return f"{username}/{project_type}-{timestamp}"

    except Exception as e:
        # Best-effort helper: report and fall back to None so callers can
        # supply their own space name instead of crashing.
        print(f"Failed to generate default space name: {e}")
        return None
|
94 |
+
|
95 |
+
|
96 |
+
def main():
    """Smoke-test the local Hugging Face setup: find a token in the
    environment, validate it, and preview the auto-generated Trackio
    space names used by the training scripts."""
    print("π Testing Hugging Face Setup for Trackio Integration")
    print("=" * 60)

    # Step 1: look for a token in the environment (HF_TOKEN preferred,
    # HUGGINGFACE_HUB_TOKEN as fallback) — same lookup the trainers use.
    print("\n1. Checking for Hugging Face tokens...")

    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
    if token:
        # Only the first 10 characters are echoed to avoid leaking the token.
        print(f"β Found token in environment: {token[:10]}...")
    else:
        # No token: print setup instructions and bail out early.
        print("β No token found in environment variables")
        print("\nβ No Hugging Face token found!")
        print("Please set the HF_TOKEN environment variable:")
        print("  Linux/Mac: export HF_TOKEN=your_token_here")
        print("  Windows: set HF_TOKEN=your_token_here")
        print("  Or: set HUGGINGFACE_HUB_TOKEN=your_token_here")
        print("\nGet your token from: https://huggingface.co/settings/tokens")
        return

    # Step 2: validate the token against the Hub and resolve the username.
    print("\n2. Validating token...")
    success, username, error = validate_hf_token(token)

    if success:
        print(f"β Token is valid! Username: {username}")
    else:
        print(f"β Token validation failed: {error}")
        return

    # Step 3: show the default space names each training mode would use.
    print("\n3. Generating default space names...")

    full_finetune_space = get_default_space_name("voxtral-asr-finetuning")
    lora_finetune_space = get_default_space_name("voxtral-lora-finetuning")

    print(f"π Full fine-tuning space: {full_finetune_space}")
    print(f"π LoRA fine-tuning space: {lora_finetune_space}")

    print("\nβ Setup complete! You can now run training scripts.")
    print("  They will automatically use the generated space names.")
    print("\nπ‘ To override the auto-generated names, use --trackio-space yourname/custom-space")


if __name__ == "__main__":
    main()
|