Joseph Pollack committed on
Commit
676b3f3
·
unverified ·
1 Parent(s): b9f51a0

adds automatic authentication, dataset README, push-to-Hub automation, demo, README, and interface improvements

interface.py CHANGED
@@ -155,6 +155,104 @@ def _save_uploaded_dataset(files: list, transcripts: list[str]) -> str:
     return str(jsonl_path)


+def _push_dataset_to_hub(jsonl_path: str, repo_name: str, username: str = "") -> str:
+    """Push dataset to Hugging Face Hub"""
+    try:
+        from huggingface_hub import HfApi, create_repo
+        import json
+        from pathlib import Path
+
+        token = os.getenv("HF_TOKEN") or os.getenv("HF_WRITE_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            return "❌ No HF_TOKEN found. Set HF_TOKEN environment variable to push datasets."
+
+        api = HfApi(token=token)
+
+        # Determine full repo name
+        if "/" not in repo_name:
+            if not username:
+                user_info = api.whoami()
+                username = user_info.get("name") or user_info.get("username") or ""
+            if username:
+                repo_name = f"{username}/{repo_name}"
+
+        # Create dataset repository
+        try:
+            create_repo(repo_name, repo_type="dataset", token=token, exist_ok=True)
+        except Exception as e:
+            if "already exists" not in str(e).lower():
+                return f"❌ Failed to create dataset repo: {e}"
+
+        # Read the JSONL file
+        jsonl_file = Path(jsonl_path)
+        if not jsonl_file.exists():
+            return f"❌ Dataset file not found: {jsonl_path}"
+
+        # Upload the JSONL file
+        api.upload_file(
+            path_or_fileobj=str(jsonl_file),
+            path_in_repo="data.jsonl",
+            repo_id=repo_name,
+            repo_type="dataset",
+            token=token
+        )
+
+        # Create a simple README for the dataset
+        readme_content = f"""---
+dataset_info:
+  features:
+  - name: audio_path
+    dtype: string
+  - name: text
+    dtype: string
+  splits:
+  - name: train
+    num_bytes: {jsonl_file.stat().st_size}
+    num_examples: {sum(1 for _ in open(jsonl_file))}
+  download_size: {jsonl_file.stat().st_size}
+  dataset_size: {jsonl_file.stat().st_size}
+---
+
+# Voxtral ASR Dataset
+
+This dataset was created using the Voxtral ASR Fine-tuning Interface.
+
+## Dataset Structure
+
+- **audio_path**: Path to the audio file
+- **text**: Transcription of the audio
+
+## Usage
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("{repo_name}")
+```
+"""
+
+        # Upload README
+        readme_path = jsonl_file.parent / "README.md"
+        with open(readme_path, "w") as f:
+            f.write(readme_content)
+
+        api.upload_file(
+            path_or_fileobj=str(readme_path),
+            path_in_repo="README.md",
+            repo_id=repo_name,
+            repo_type="dataset",
+            token=token
+        )
+
+        readme_path.unlink()  # Clean up temp file
+
+        return f"✅ Dataset pushed to: https://huggingface.co/datasets/{repo_name}"
+
+    except Exception as e:
+        return f"❌ Failed to push dataset: {e}"
+
+
 def _save_recordings(recordings: list[tuple[int, list]], transcripts: list[str]) -> str:
     import soundfile as sf
     dataset_dir = PROJECT_ROOT / "datasets" / "voxtral_user"
@@ -231,6 +329,7 @@ def start_voxtral_training(
     repo_name = f"{username}/{repo_short}" if username else repo_short
     push_args = [
         str(PROJECT_ROOT / "scripts/push_to_huggingface.py"),
+        "model",
         str(output_dir),
         repo_name,
     ]
@@ -519,6 +618,7 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
         gr.update(visible=True),  # dataset_status
         gr.update(visible=True),  # advanced_accordion
         gr.update(visible=True),  # save_rec_btn
+        gr.update(visible=True),  # push_recordings_btn
         gr.update(visible=True),  # start_btn
         gr.update(visible=True),  # logs_box
     ]
@@ -607,17 +707,27 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:
     gr.Markdown("### Upload audio + transcripts (optional)")
     upload_audio = gr.File(file_count="multiple", type="filepath", label="Upload WAV/FLAC files (optional)")
     transcripts_box = gr.Textbox(lines=6, label="Transcripts (one per line, aligned with files)")
+    dataset_repo_name = gr.Textbox(value=f"voxtral-dataset-{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+                                   label="Dataset repo name (will be pushed to HF Hub)")
     save_upload_btn = gr.Button("Save uploaded dataset")
+    push_dataset_btn = gr.Button("Push dataset to HF Hub")

     def _collect_upload(files, txt):
         lines = [s.strip() for s in (txt or "").splitlines() if s.strip()]
-        return _save_uploaded_dataset(files or [], lines)
+        jsonl_path = _save_uploaded_dataset(files or [], lines)
+        return f"✅ Dataset saved locally: {jsonl_path}"

-    # Removed - no longer needed since jsonl_out was removed
-    # save_upload_btn.click(_collect_upload, [upload_audio, transcripts_box], [])
+    def _push_dataset_handler(repo_name):
+        if not jsonl_path_state.value:
+            return "❌ No dataset saved yet. Please save dataset first."
+        return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
+
+    save_upload_btn.click(_collect_upload, [upload_audio, transcripts_box], [jsonl_path_state])
+    push_dataset_btn.click(_push_dataset_handler, [dataset_repo_name], [jsonl_path_state])

     # Save recordings button
     save_rec_btn = gr.Button("Save recordings as dataset", visible=False)
+    push_recordings_btn = gr.Button("Push recordings dataset to HF Hub", visible=False)

     def _collect_preloaded_recs(*recs_and_texts):
         import soundfile as sf
@@ -646,6 +756,13 @@ with gr.Blocks(title="Voxtral ASR Fine-tuning") as demo:

     save_rec_btn.click(_collect_preloaded_recs, rec_components + [phrase_texts_state], [jsonl_path_state])

+    def _push_recordings_handler(repo_name):
+        if not jsonl_path_state.value:
+            return "❌ No recordings dataset saved yet. Please save recordings first."
+        return _push_dataset_to_hub(jsonl_path_state.value, repo_name)
+
+    push_recordings_btn.click(_push_recordings_handler, [dataset_repo_name], [jsonl_path_state])
+
     # Removed multilingual dataset sample section - phrases are now loaded automatically when language is selected

     start_btn = gr.Button("Start Fine-tuning", visible=False)
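Note: the `_push_dataset_to_hub` helper added above is plain `huggingface_hub` plumbing, so the same push can be reproduced outside the Gradio UI. A minimal sketch, assuming an `HF_TOKEN` in the environment; the repo id and JSONL path are illustrative:

```python
import os
from huggingface_hub import HfApi, create_repo

token = os.getenv("HF_TOKEN")               # same variable the helper reads
repo_id = "your-user/voxtral-dataset-demo"  # hypothetical dataset repo
create_repo(repo_id, repo_type="dataset", token=token, exist_ok=True)
HfApi(token=token).upload_file(
    path_or_fileobj="datasets/voxtral_user/data.jsonl",  # illustrative local path
    path_in_repo="data.jsonl",
    repo_id=repo_id,
    repo_type="dataset",
)
```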
requirements.txt CHANGED
@@ -2,4 +2,6 @@ torch
 datasets
 peft
 transformers
-gradio
+gradio
+trackio
+huggingface_hub
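A quick sanity check (not part of the commit) that the newly added dependencies resolve after `pip install -r requirements.txt`:

```python
# each import fails loudly if the corresponding requirement is missing
import gradio
import trackio
import huggingface_hub

print(gradio.__version__, huggingface_hub.__version__)
```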
scripts/push_to_huggingface.py CHANGED
@@ -1,20 +1,26 @@
 #!/usr/bin/env python3
 """
-Push Trained Model and Results to Hugging Face Hub
-Integrates with Trackio monitoring and HF Datasets for complete model deployment
+Push Trained Models and Datasets to Hugging Face Hub
+
+Usage:
+    # Push a trained model
+    python push_to_huggingface.py model /path/to/model my-model-repo
+
+    # Push a dataset
+    python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo
+
+Authentication:
+    Set HF_TOKEN environment variable or use --token:
+    export HF_TOKEN=your_token_here
 """

 import os
 import json
 import argparse
 import logging
-import time
 from pathlib import Path
-from typing import Dict, Any, Optional, List
+from typing import Dict, Any, Optional
 from datetime import datetime
-import subprocess
-import shutil
-import platform

 # Set timeout for HF operations to prevent hanging
 os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
@@ -22,34 +28,15 @@ os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600'

 try:
     from huggingface_hub import HfApi, create_repo, upload_file
-    from huggingface_hub import snapshot_download, hf_hub_download
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
     print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")

-try:
-    import sys
-    import os
-    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
-    from monitoring import SmolLM3Monitor
-    MONITORING_AVAILABLE = True
-except ImportError:
-    MONITORING_AVAILABLE = False
-    print("Warning: monitoring module not available")
-
 logger = logging.getLogger(__name__)

-class TimeoutError(Exception):
-    """Custom timeout exception"""
-    pass
-
-def timeout_handler(signum, frame):
-    """Signal handler for timeout"""
-    raise TimeoutError("Operation timed out")
-
 class HuggingFacePusher:
-    """Push trained models and results to Hugging Face Hub with HF Datasets integration"""
+    """Push trained models to Hugging Face Hub"""

     def __init__(
         self,
@@ -57,44 +44,22 @@ class HuggingFacePusher:
         repo_name: str,
         token: Optional[str] = None,
         private: bool = False,
-        trackio_url: Optional[str] = None,
-        experiment_name: Optional[str] = None,
-        dataset_repo: Optional[str] = None,
-        hf_token: Optional[str] = None,
         author_name: Optional[str] = None,
         model_description: Optional[str] = None,
-        training_config_type: Optional[str] = None,
         model_name: Optional[str] = None,
-        dataset_name: Optional[str] = None,
-        batch_size: Optional[str] = None,
-        learning_rate: Optional[str] = None,
-        max_epochs: Optional[str] = None,
-        max_seq_length: Optional[str] = None,
-        trainer_type: Optional[str] = None
+        dataset_name: Optional[str] = None
     ):
         self.model_path = Path(model_path)
         # Original user input (may be just the repo name without username)
         self.repo_name = repo_name
-        self.token = token or hf_token or os.getenv('HF_TOKEN')
+        self.token = token or os.getenv('HF_TOKEN')
         self.private = private
-        self.trackio_url = trackio_url
-        self.experiment_name = experiment_name
         self.author_name = author_name
         self.model_description = model_description
-
-        # Training configuration details for model card generation
-        self.training_config_type = training_config_type
-        self.model_name = model_name
+
+        # Model card generation details
+        self.model_name = model_name
         self.dataset_name = dataset_name
-        self.batch_size = batch_size
-        self.learning_rate = learning_rate
-        self.max_epochs = max_epochs
-        self.max_seq_length = max_seq_length
-        self.trainer_type = trainer_type
-
-        # HF Datasets configuration
-        self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
-        self.hf_token = hf_token or os.getenv('HF_TOKEN')

         # Initialize HF API
         if HF_AVAILABLE:
@@ -105,19 +70,7 @@
         # Resolve the full repo id (username/repo) if user only provided repo name
         self.repo_id = self._resolve_repo_id(self.repo_name)

-        # Initialize monitoring if available
-        self.monitor = None
-        if MONITORING_AVAILABLE:
-            self.monitor = SmolLM3Monitor(
-                experiment_name=experiment_name or "model_push",
-                trackio_url=trackio_url,
-                enable_tracking=bool(trackio_url),
-                hf_token=self.hf_token,
-                dataset_repo=self.dataset_repo
-            )
-
         logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
-        logger.info(f"Dataset repository: {self.dataset_repo}")

     def _resolve_repo_id(self, repo_name: str) -> str:
         """Return a fully-qualified repo id in the form username/repo.
@@ -515,59 +468,33 @@ MIT License
             logger.error(f"❌ Failed to create README: {e}")
             return False

-    def log_to_trackio(self, action: str, details: Dict[str, Any]):
-        """Log push action to Trackio and HF Datasets"""
-        if self.monitor:
-            try:
-                # Log to Trackio
-                self.monitor.log_metrics({
-                    "push_action": action,
-                    "repo_name": self.repo_id,
-                    "model_size_gb": self._get_model_size(),
-                    "dataset_repo": self.dataset_repo,
-                    **details
-                })
-
-                # Log training summary
-                self.monitor.log_training_summary({
-                    "model_push": True,
-                    "model_repo": self.repo_id,
-                    "dataset_repo": self.dataset_repo,
-                    "push_date": datetime.now().isoformat(),
-                    **details
-                })
-
-                logger.info(f"✅ Logged {action} to Trackio and HF Datasets")
-            except Exception as e:
-                logger.error(f"❌ Failed to log to Trackio: {e}")
-
-    def push_model(self, training_config: Optional[Dict[str, Any]] = None,
+
+    def push_model(self, training_config: Optional[Dict[str, Any]] = None,
                    results: Optional[Dict[str, Any]] = None) -> bool:
-        """Complete model push process with HF Datasets integration"""
+        """Complete model push process"""
         logger.info(f"🚀 Starting model push to {self.repo_id}")
-        logger.info(f"📊 Dataset repository: {self.dataset_repo}")
-
+
         # Validate model path
         if not self.validate_model_path():
             return False
-
+
         # Create repository
         if not self.create_repository():
             return False
-
+
         # Load training config and results if not provided
         if training_config is None:
             training_config = self._load_training_config()
-
+
         if results is None:
             results = self._load_training_results()
-
+
         # Create and upload model card
         model_card = self.create_model_card(training_config, results)
         model_card_path = Path("temp_model_card.md")
         with open(model_card_path, "w") as f:
             f.write(model_card)
-
+
         try:
             upload_file(
                 path_or_fileobj=str(model_card_path),
@@ -577,27 +504,135 @@ MIT License
             )
         finally:
             model_card_path.unlink()
-
+
         # Upload model files
         if not self.upload_model_files():
             return False
-
+
         # Upload training results
         if results:
             self.upload_training_results(str(self.model_path))
-
-        # Log to Trackio and HF Datasets
-        self.log_to_trackio("model_push", {
-            "model_path": str(self.model_path),
-            "repo_name": self.repo_name,
-            "private": self.private,
-            "training_config": training_config,
-            "results": results
-        })
-
+
+        # Log success
+        logger.info(f"✅ Model successfully pushed to {self.repo_id}")
         logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_id}")
-        logger.info(f"📊 Experiment data stored in: {self.dataset_repo}")
+
         return True
+
+    def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
+        """Push dataset to Hugging Face Hub"""
+        logger.info(f"🚀 Starting dataset push to {dataset_repo_name}")
+
+        try:
+            from huggingface_hub import create_repo
+            import json
+
+            # Determine full dataset repo name
+            if "/" not in dataset_repo_name:
+                dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}"
+
+            # Create dataset repository
+            try:
+                create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True)
+                logger.info(f"✅ Created dataset repository: {dataset_repo_name}")
+            except Exception as e:
+                if "already exists" not in str(e).lower():
+                    logger.error(f"❌ Failed to create dataset repo: {e}")
+                    return False
+                logger.info(f"📁 Dataset repository already exists: {dataset_repo_name}")
+
+            # Read the dataset file
+            dataset_file = Path(dataset_path)
+            if not dataset_file.exists():
+                logger.error(f"❌ Dataset file not found: {dataset_path}")
+                return False
+
+            # Count lines for metadata
+            with open(dataset_file, 'r', encoding='utf-8') as f:
+                num_examples = sum(1 for _ in f)
+
+            file_size = dataset_file.stat().st_size
+
+            # Upload the dataset file
+            upload_file(
+                path_or_fileobj=str(dataset_file),
+                path_in_repo="data.jsonl",
+                repo_id=dataset_repo_name,
+                repo_type="dataset",
+                token=self.token
+            )
+            logger.info(f"✅ Uploaded dataset file: {dataset_file.name}")
+
+            # Create a dataset README
+            readme_content = f"""---
+dataset_info:
+  features:
+  - name: audio_path
+    dtype: string
+  - name: text
+    dtype: string
+  splits:
+  - name: train
+    num_bytes: {file_size}
+    num_examples: {num_examples}
+  download_size: {file_size}
+  dataset_size: {file_size}
+tags:
+- voxtral
+- asr
+- fine-tuning
+- conversational
+- speech-to-text
+- audio-to-text
+- tonic
+---
+
+# Voxtral ASR Dataset
+
+This dataset was created for fine-tuning Voxtral ASR models.
+
+## Dataset Structure
+
+- **audio_path**: Path to the audio file
+- **text**: Transcription of the audio
+
+## Statistics
+
+- Number of examples: {num_examples}
+- File size: {file_size} bytes
+
+## Usage
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("{dataset_repo_name}")
+```
+"""
+
+            # Upload README
+            readme_path = dataset_file.parent / "README.md"
+            with open(readme_path, "w") as f:
+                f.write(readme_content)
+
+            upload_file(
+                path_or_fileobj=str(readme_path),
+                path_in_repo="README.md",
+                repo_id=dataset_repo_name,
+                repo_type="dataset",
+                token=self.token
+            )
+
+            readme_path.unlink()  # Clean up temp file
+
+            logger.info(f"✅ Dataset README uploaded")
+            logger.info(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
+
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to push dataset: {e}")
+            return False

     def _load_training_config(self) -> Dict[str, Any]:
         """Load training configuration"""
@@ -619,81 +654,94 @@ def parse_args():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')

-    # Required arguments
-    parser.add_argument('model_path', type=str, help='Path to trained model directory')
-    parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
-
-    # Optional arguments
-    parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
-    parser.add_argument('--hf-token', type=str, default=None, help='Hugging Face token (alternative to --token)')
-    parser.add_argument('--private', action='store_true', help='Make repository private')
-    parser.add_argument('--trackio-url', type=str, default=None, help='Trackio Space URL for logging')
-    parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for Trackio')
-    parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
-    parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
-    parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
-    parser.add_argument('--training-config-type', type=str, default=None, help='Training configuration type')
-    parser.add_argument('--model-name', type=str, default=None, help='Base model name')
-    parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
-    parser.add_argument('--batch-size', type=str, default=None, help='Batch size')
-    parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate')
-    parser.add_argument('--max-epochs', type=str, default=None, help='Maximum epochs')
-    parser.add_argument('--max-seq-length', type=str, default=None, help='Maximum sequence length')
-    parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
+    # Subcommands
+    subparsers = parser.add_subparsers(dest='command', help='Available commands')
+
+    # Model push subcommand
+    model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub')
+    model_parser.add_argument('model_path', type=str, help='Path to trained model directory')
+    model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
+    model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
+    model_parser.add_argument('--private', action='store_true', help='Make repository private')
+    model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
+    model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
+    model_parser.add_argument('--model-name', type=str, default=None, help='Base model name')
+    model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
+
+    # Dataset push subcommand
+    dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub')
+    dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file')
+    dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
+    dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
+    dataset_parser.add_argument('--private', action='store_true', help='Make repository private')

     return parser.parse_args()

 def main():
     """Main function"""
     args = parse_args()
-
+
     # Setup logging
     logging.basicConfig(
         level=logging.INFO,
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
     )
-
-    logger.info("Starting model push to Hugging Face Hub")
-
-    # Initialize pusher
+
+    if not args.command:
+        logger.error("❌ No command specified. Use 'model' or 'dataset' subcommand.")
+        return 1
+
     try:
-        pusher = HuggingFacePusher(
-            model_path=args.model_path,
-            repo_name=args.repo_name,
-            token=args.token,
-            private=args.private,
-            trackio_url=args.trackio_url,
-            experiment_name=args.experiment_name,
-            dataset_repo=args.dataset_repo,
-            hf_token=args.hf_token,
-            author_name=args.author_name,
-            model_description=args.model_description,
-            training_config_type=args.training_config_type,
-            model_name=args.model_name,
-            dataset_name=args.dataset_name,
-            batch_size=args.batch_size,
-            learning_rate=args.learning_rate,
-            max_epochs=args.max_epochs,
-            max_seq_length=args.max_seq_length,
-            trainer_type=args.trainer_type
-        )
-
-        # Push model
-        success = pusher.push_model()
-
-        if success:
-            logger.info("✅ Model push completed successfully!")
-            logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
-            if args.dataset_repo:
-                logger.info(f"📊 View experiment data at: https://huggingface.co/datasets/{args.dataset_repo}")
-        else:
-            logger.error("❌ Model push failed!")
-            return 1
-
+        if args.command == 'model':
+            logger.info("Starting model push to Hugging Face Hub")
+
+            # Initialize pusher
+            pusher = HuggingFacePusher(
+                model_path=args.model_path,
+                repo_name=args.repo_name,
+                token=args.token,
+                private=args.private,
+                author_name=args.author_name,
+                model_description=args.model_description,
+                model_name=args.model_name,
+                dataset_name=args.dataset_name
+            )
+
+            # Push model
+            success = pusher.push_model()
+
+            if success:
+                logger.info("✅ Model push completed successfully!")
+                logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
+            else:
+                logger.error("❌ Model push failed!")
+                return 1
+
+        elif args.command == 'dataset':
+            logger.info("Starting dataset push to Hugging Face Hub")
+
+            # Initialize pusher for dataset
+            pusher = HuggingFacePusher(
+                model_path="",  # Not needed for dataset push
+                repo_name=args.repo_name,
+                token=args.token,
+                private=args.private
+            )
+
+            # Push dataset
+            success = pusher.push_dataset(args.dataset_path, args.repo_name)
+
+            if success:
+                logger.info("✅ Dataset push completed successfully!")
+                logger.info(f"📊 View your dataset at: https://huggingface.co/datasets/{args.repo_name}")
+            else:
+                logger.error("❌ Dataset push failed!")
+                return 1
+
     except Exception as e:
-        logger.error(f"❌ Error during model push: {e}")
+        logger.error(f"❌ Error during push: {e}")
         return 1
-
+
     return 0

 if __name__ == "__main__":
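Besides the CLI shown in the module docstring, the pusher can also be driven from Python. A minimal sketch under the constructor signature above; the checkpoint path and repo name are illustrative:

```python
from push_to_huggingface import HuggingFacePusher  # run from within scripts/

pusher = HuggingFacePusher(
    model_path="outputs/voxtral-finetuned",  # hypothetical trained-model directory
    repo_name="voxtral-finetuned-demo",      # username auto-resolved from HF_TOKEN
)
if pusher.push_model():
    print("push complete")
```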
scripts/train.py CHANGED
@@ -1,8 +1,31 @@
 #!/usr/bin/env python3
+"""
+Voxtral ASR Full Fine-tuning Script with Trackio Integration
+
+This script fine-tunes Voxtral models for ASR tasks with automatic experiment tracking
+via Trackio and Hugging Face Spaces.
+
+Features:
+- Automatic username detection from HF_TOKEN environment variable
+- Auto-generated space names with timestamps
+- Local-only mode when no HF_TOKEN is set
+- Comprehensive experiment logging
+- Optional dataset pushing to Hugging Face Hub
+
+Authentication:
+    Set HF_TOKEN environment variable to enable automatic space creation:
+    Linux/Mac: export HF_TOKEN=your_token_here
+    Windows: set HF_TOKEN=your_token_here
+    Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
+
+    Get your token from: https://huggingface.co/settings/tokens
+"""

 import argparse
 import json
 from pathlib import Path
+from datetime import datetime
+from typing import Tuple, Optional
 import torch
 from datasets import load_dataset, Audio, Dataset
 from transformers import (
@@ -11,6 +34,85 @@ from transformers import (
     Trainer,
     TrainingArguments,
 )
+from huggingface_hub import HfApi
+import trackio
+
+
+def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    Validate a Hugging Face token and return the username.
+
+    Args:
+        token (str): The Hugging Face token to validate
+
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+        - success: True if token is valid, False otherwise
+        - username: The username associated with the token (if valid)
+        - error_message: Error message if validation failed
+    """
+    try:
+        # Create API client with token directly
+        api = HfApi(token=token)
+
+        # Try to get user info - this will fail if token is invalid
+        user_info = api.whoami()
+
+        # Extract username from user info
+        username = user_info.get("name", user_info.get("username"))
+
+        if not username:
+            return False, None, "Could not retrieve username from token"
+
+        return True, username, None
+
+    except Exception as e:
+        error_msg = str(e)
+        if "401" in error_msg or "unauthorized" in error_msg.lower():
+            return False, None, "Invalid token - unauthorized access"
+        elif "403" in error_msg:
+            return False, None, "Token lacks required permissions"
+        elif "network" in error_msg.lower() or "connection" in error_msg.lower():
+            return False, None, f"Network error: {error_msg}"
+        else:
+            return False, None, f"Validation error: {error_msg}"
+
+
+def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> str:
+    """
+    Generate a default space name with username and timestamp.
+
+    Args:
+        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
+
+    Returns:
+        str: Default space name in format "username/project-type-timestamp"
+    """
+    try:
+        # Get token from environment variables only
+        import os
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
+            print("Set HF_TOKEN environment variable to enable automatic space creation.")
+            print("Example: export HF_TOKEN=your_token_here")
+            print("Falling back to local-only mode.")
+            return None
+
+        # Validate token and get username
+        success, username, error = validate_hf_token(token)
+        if success and username:
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            return f"{username}/{project_type}-{timestamp}"
+        else:
+            print(f"Warning: Token validation failed: {error}")
+            print("Falling back to local-only mode.")
+            return None
+
+    except Exception as e:
+        print(f"Warning: Failed to generate default space name: {e}")
+        return None


 class VoxtralDataCollator:
@@ -161,6 +263,12 @@ def main():
     parser.add_argument("--epochs", type=float, default=3)
     parser.add_argument("--logging-steps", type=int, default=10)
    parser.add_argument("--save-steps", type=int, default=50)
+    parser.add_argument("--trackio-space", type=str, default=None,
+                        help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
+    parser.add_argument("--push-dataset", action="store_true",
+                        help="Push the training dataset to Hugging Face Hub after training")
+    parser.add_argument("--dataset-repo", type=str, default=None,
+                        help="Dataset repository name for pushing dataset (format: username/dataset-name)")
     args = parser.parse_args()

     model_checkpoint = args.model_checkpoint
@@ -169,6 +277,48 @@ def main():
     torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {torch_device}")

+    # Determine trackio space
+    trackio_space = args.trackio_space
+    if not trackio_space:
+        trackio_space = get_default_space_name("voxtral-asr-finetuning")
+
+    # Initialize trackio for experiment tracking
+    if trackio_space:
+        print(f"Initializing trackio with space: {trackio_space}")
+        trackio.init(
+            project="voxtral-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+            },
+            space_id=trackio_space
+        )
+    else:
+        print("Initializing trackio in local-only mode")
+        trackio.init(
+            project="voxtral-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+            }
+        )
+
     print("Loading processor and model...")
     processor = VoxtralProcessor.from_pretrained(model_checkpoint)
     model = VoxtralForConditionalGeneration.from_pretrained(
@@ -200,7 +350,7 @@ def main():
         save_steps=args.save_steps,
         eval_strategy="steps" if eval_dataset else "no",
         save_strategy="steps",
-        report_to="none",
+        report_to=["trackio"],
         remove_unused_columns=False,
         dataloader_num_workers=1,
     )
@@ -223,6 +373,44 @@ def main():
     if eval_dataset:
         results = trainer.evaluate()
         print(f"Final evaluation results: {results}")
+        # Log final evaluation results
+        trackio.log(results)
+
+    # Push dataset to Hub if requested
+    if args.push_dataset and args.dataset_jsonl:
+        print("Pushing dataset to Hugging Face Hub...")
+        try:
+            from pathlib import Path
+            import subprocess
+
+            dataset_repo = args.dataset_repo
+            if not dataset_repo:
+                # Auto-generate dataset repo name
+                if trackio_space:
+                    username = trackio_space.split('/')[0]
+                    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+                    dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
+                else:
+                    print("Warning: Cannot auto-generate dataset repo name without HF token")
+                    dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+            # Call the push script
+            push_cmd = [
+                "python", str(Path(__file__).parent / "push_to_huggingface.py"),
+                "dataset", args.dataset_jsonl, dataset_repo
+            ]
+
+            result = subprocess.run(push_cmd, capture_output=True, text=True)
+            if result.returncode == 0:
+                print(f"✅ Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
+            else:
+                print(f"❌ Failed to push dataset: {result.stderr}")
+
+        except Exception as e:
+            print(f"❌ Error pushing dataset: {e}")
+
+    # Finish trackio logging
+    trackio.finish()

     print("Training completed successfully!")
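The `Trainer` now streams metrics through `report_to=["trackio"]`, and the script brackets training with an explicit init/log/finish cycle. A minimal local-only sketch of that lifecycle (config values are illustrative):

```python
import trackio

trackio.init(project="voxtral-finetuning", config={"learning_rate": 5e-5})
trackio.log({"eval_loss": 1.23})  # mirrors the final trackio.log(results) above
trackio.finish()
```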
 
scripts/train_lora.py CHANGED
@@ -1,8 +1,32 @@
 #!/usr/bin/env python3
+"""
+Voxtral ASR LoRA Fine-tuning Script with Trackio Integration
+
+This script fine-tunes Voxtral models using LoRA for ASR tasks with automatic experiment tracking
+via Trackio and Hugging Face Spaces.
+
+Features:
+- Automatic username detection from HF_TOKEN environment variable
+- Auto-generated space names with timestamps
+- Local-only mode when no HF_TOKEN is set
+- Comprehensive experiment logging
+- LoRA-specific hyperparameters tracking
+- Optional dataset pushing to Hugging Face Hub
+
+Authentication:
+    Set HF_TOKEN environment variable to enable automatic space creation:
+    Linux/Mac: export HF_TOKEN=your_token_here
+    Windows: set HF_TOKEN=your_token_here
+    Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
+
+    Get your token from: https://huggingface.co/settings/tokens
+"""

 import argparse
 import json
 from pathlib import Path
+from datetime import datetime
+from typing import Tuple, Optional
 import torch
 from datasets import load_dataset, Audio, Dataset
 from transformers import (
@@ -12,6 +36,85 @@ from transformers import (
     TrainingArguments,
 )
 from peft import LoraConfig, get_peft_model
+from huggingface_hub import HfApi
+import trackio
+
+
+def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    Validate a Hugging Face token and return the username.
+
+    Args:
+        token (str): The Hugging Face token to validate
+
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+        - success: True if token is valid, False otherwise
+        - username: The username associated with the token (if valid)
+        - error_message: Error message if validation failed
+    """
+    try:
+        # Create API client with token directly
+        api = HfApi(token=token)
+
+        # Try to get user info - this will fail if token is invalid
+        user_info = api.whoami()
+
+        # Extract username from user info
+        username = user_info.get("name", user_info.get("username"))
+
+        if not username:
+            return False, None, "Could not retrieve username from token"
+
+        return True, username, None
+
+    except Exception as e:
+        error_msg = str(e)
+        if "401" in error_msg or "unauthorized" in error_msg.lower():
+            return False, None, "Invalid token - unauthorized access"
+        elif "403" in error_msg:
+            return False, None, "Token lacks required permissions"
+        elif "network" in error_msg.lower() or "connection" in error_msg.lower():
+            return False, None, f"Network error: {error_msg}"
+        else:
+            return False, None, f"Validation error: {error_msg}"
+
+
+def get_default_space_name(project_type: str = "voxtral-lora-finetuning") -> str:
+    """
+    Generate a default space name with username and timestamp.
+
+    Args:
+        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
+
+    Returns:
+        str: Default space name in format "username/project-type-timestamp"
+    """
+    try:
+        # Get token from environment variables only
+        import os
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            print("Warning: No HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variable found.")
+            print("Set HF_TOKEN environment variable to enable automatic space creation.")
+            print("Example: export HF_TOKEN=your_token_here")
+            print("Falling back to local-only mode.")
+            return None
+
+        # Validate token and get username
+        success, username, error = validate_hf_token(token)
+        if success and username:
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            return f"{username}/{project_type}-{timestamp}"
+        else:
+            print(f"Warning: Token validation failed: {error}")
+            print("Falling back to local-only mode.")
+            return None
+
+    except Exception as e:
+        print(f"Warning: Failed to generate default space name: {e}")
+        return None


 class VoxtralDataCollator:
@@ -163,6 +266,12 @@ def main():
     parser.add_argument("--lora-alpha", type=int, default=32)
     parser.add_argument("--lora-dropout", type=float, default=0.0)
     parser.add_argument("--freeze-audio-tower", action="store_true", help="Freeze audio encoder parameters")
+    parser.add_argument("--trackio-space", type=str, default=None,
+                        help="Hugging Face Space ID for trackio logging (format: username/space-name). If not provided, will auto-generate based on HF token")
+    parser.add_argument("--push-dataset", action="store_true",
+                        help="Push the training dataset to Hugging Face Hub after training")
+    parser.add_argument("--dataset-repo", type=str, default=None,
+                        help="Dataset repository name for pushing dataset (format: username/dataset-name)")
     args = parser.parse_args()

     model_checkpoint = args.model_checkpoint
@@ -171,6 +280,56 @@ def main():
     torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {torch_device}")

+    # Determine trackio space
+    trackio_space = args.trackio_space
+    if not trackio_space:
+        trackio_space = get_default_space_name("voxtral-lora-finetuning")
+
+    # Initialize trackio for experiment tracking
+    if trackio_space:
+        print(f"Initializing trackio with space: {trackio_space}")
+        trackio.init(
+            project="voxtral-lora-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+                "lora_r": args.lora_r,
+                "lora_alpha": args.lora_alpha,
+                "lora_dropout": args.lora_dropout,
+                "freeze_audio_tower": args.freeze_audio_tower,
+            },
+            space_id=trackio_space
+        )
+    else:
+        print("Initializing trackio in local-only mode")
+        trackio.init(
+            project="voxtral-lora-finetuning",
+            config={
+                "model_checkpoint": model_checkpoint,
+                "output_dir": output_dir,
+                "batch_size": args.batch_size,
+                "learning_rate": args.learning_rate,
+                "epochs": args.epochs,
+                "train_count": args.train_count,
+                "eval_count": args.eval_count,
+                "dataset_jsonl": args.dataset_jsonl,
+                "dataset_name": args.dataset_name,
+                "dataset_config": args.dataset_config,
+                "lora_r": args.lora_r,
+                "lora_alpha": args.lora_alpha,
+                "lora_dropout": args.lora_dropout,
+                "freeze_audio_tower": args.freeze_audio_tower,
+            }
+        )
+
     print("Loading processor and model...")
     processor = VoxtralProcessor.from_pretrained(model_checkpoint)
     lora_cfg = LoraConfig(
@@ -210,12 +369,12 @@ def main():
         learning_rate=args.learning_rate,
         num_train_epochs=args.epochs,
         bf16=True,
-        logging_steps=args.logging_issues if hasattr(args, 'logging_issues') else args.logging_steps,
+        logging_steps=args.logging_steps,
         eval_steps=args.save_steps if eval_dataset else None,
         save_steps=args.save_steps,
         eval_strategy="steps" if eval_dataset else "no",
         save_strategy="steps",
-        report_to="none",
+        report_to=["trackio"],
         remove_unused_columns=False,
         dataloader_num_workers=1,
     )
@@ -238,6 +397,44 @@ def main():
     if eval_dataset:
         results = trainer.evaluate()
         print(f"Final evaluation results: {results}")
+        # Log final evaluation results
+        trackio.log(results)
+
+    # Push dataset to Hub if requested
+    if args.push_dataset and args.dataset_jsonl:
+        print("Pushing dataset to Hugging Face Hub...")
+        try:
+            from pathlib import Path
+            import subprocess
+
+            dataset_repo = args.dataset_repo
+            if not dataset_repo:
+                # Auto-generate dataset repo name
+                if trackio_space:
+                    username = trackio_space.split('/')[0]
+                    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+                    dataset_repo = f"{username}/voxtral-dataset-{timestamp}"
+                else:
+                    print("Warning: Cannot auto-generate dataset repo name without HF token")
+                    dataset_repo = f"voxtral-dataset-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+            # Call the push script
+            push_cmd = [
+                "python", str(Path(__file__).parent / "push_to_huggingface.py"),
+                "dataset", args.dataset_jsonl, dataset_repo
+            ]
+
+            result = subprocess.run(push_cmd, capture_output=True, text=True)
+            if result.returncode == 0:
+                print(f"✅ Dataset pushed to: https://huggingface.co/datasets/{dataset_repo}")
+            else:
+                print(f"❌ Failed to push dataset: {result.stderr}")
+
+        except Exception as e:
+            print(f"❌ Error pushing dataset: {e}")
+
+    # Finish trackio logging
+    trackio.finish()

     print("Training completed successfully!")
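For reference, the LoRA variant wraps the base model with peft before training; the diff shows the CLI defaults but not the `LoraConfig(...)` arguments themselves, so this sketch fills them in with assumed values:

```python
from peft import LoraConfig, get_peft_model

lora_cfg = LoraConfig(
    r=8,                                  # assumed; the script exposes --lora-r
    lora_alpha=32,                        # matches the --lora-alpha default above
    lora_dropout=0.0,                     # matches the --lora-dropout default above
    target_modules=["q_proj", "v_proj"],  # assumed attention projections
)
model = get_peft_model(model, lora_cfg)   # `model` is the loaded Voxtral model
model.print_trainable_parameters()
```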
 
templates/datasets/readme.md DELETED
@@ -1,171 +0,0 @@
----
-dataset_info:
-  features:
-  - name: experiment_id
-    dtype: string
-  - name: name
-    dtype: string
-  - name: description
-    dtype: string
-  - name: created_at
-    dtype: string
-  - name: status
-    dtype: string
-  - name: metrics
-    dtype: string
-  - name: parameters
-    dtype: string
-  - name: artifacts
-    dtype: string
-  - name: logs
-    dtype: string
-  - name: last_updated
-    dtype: string
-  splits:
-  - name: train
-    num_bytes: 4945
-    num_examples: 2
-  download_size: 15529
-  dataset_size: 4945
-configs:
-- config_name: default
-  data_files:
-  - split: train
-    path: data/train-*
-tags:
-- track tonic
-- tonic
-- experiment tracking
-- smollm3
-- fine-tuning
-- legml
-- hermes
----
-
-# Trackio Experiments Dataset
-
-This dataset stores experiment tracking data for ML training runs, particularly focused on SmolLM3 fine-tuning experiments with comprehensive metrics tracking.
-
-## Dataset Structure
-
-The dataset contains the following columns:
-
-- **experiment_id**: Unique identifier for each experiment
-- **name**: Human-readable name for the experiment
-- **description**: Detailed description of the experiment
-- **created_at**: Timestamp when the experiment was created
-- **status**: Current status (running, completed, failed, paused)
-- **metrics**: JSON string containing training metrics over time
-- **parameters**: JSON string containing experiment configuration
-- **artifacts**: JSON string containing experiment artifacts
-- **logs**: JSON string containing experiment logs
-- **last_updated**: Timestamp of last update
-
-## Metrics Structure
-
-The metrics field contains JSON arrays with the following structure:
-
-```json
-[
-  {
-    "timestamp": "2025-07-20T11:20:01.780908",
-    "step": 25,
-    "metrics": {
-      "loss": 1.1659,
-      "accuracy": 0.759,
-      "learning_rate": 7e-08,
-      "grad_norm": 10.3125,
-      "epoch": 0.004851130919895701,
-
-      // Advanced Training Metrics
-      "total_tokens": 1642080.0,
-      "truncated_tokens": 128,
-      "padding_tokens": 256,
-      "throughput": 3284160.0,
-      "step_time": 0.5,
-      "batch_size": 8,
-      "seq_len": 2048,
-      "token_acc": 0.759,
-
-      // Custom Losses
-      "train/gate_ortho": 0.0234,
-      "train/center": 0.0156,
-
-      // System Metrics
-      "gpu_memory_allocated": 17.202261447906494,
-      "gpu_memory_reserved": 75.474609375,
-      "gpu_utilization": 85.2,
-      "cpu_percent": 2.7,
-      "memory_percent": 10.1
-    }
-  }
-]
-```
-
-## Supported Metrics
-
-### Core Training Metrics
-- **loss**: Training loss value
-- **accuracy**: Model accuracy
-- **learning_rate**: Current learning rate
-- **grad_norm**: Gradient norm
-- **epoch**: Current epoch progress
-
-### Advanced Token Metrics
-- **total_tokens**: Total tokens processed in the batch
-- **truncated_tokens**: Number of tokens truncated during processing
-- **padding_tokens**: Number of padding tokens added
-- **throughput**: Tokens processed per second
-- **step_time**: Time taken for the current training step
-- **batch_size**: Current batch size
-- **seq_len**: Sequence length
-- **token_acc**: Token-level accuracy
-
-### Custom Losses (SmolLM3-specific)
-- **train/gate_ortho**: Gate orthogonality loss
-- **train/center**: Center loss component
-
-### System Performance Metrics
-- **gpu_memory_allocated**: GPU memory currently allocated (GB)
-- **gpu_memory_reserved**: GPU memory reserved (GB)
-- **gpu_utilization**: GPU utilization percentage
-- **cpu_percent**: CPU usage percentage
-- **memory_percent**: System memory usage percentage
-
-## Usage
-
-This dataset is automatically used by the Trackio monitoring system to store and retrieve experiment data. It provides persistent storage for experiment tracking across different training runs.
-
-## Integration
-
-The dataset is used by:
-- Trackio Spaces for experiment visualization
-- Training scripts for logging metrics and parameters
-- Monitoring systems for experiment tracking
-- SmolLM3 fine-tuning pipeline for comprehensive metrics capture
-
-## Privacy
-
-This dataset is private by default to ensure experiment data security. Only users with appropriate permissions can access the data.
-
-## Examples
-
-### Sample Experiment Entry
-```json
-{
-  "experiment_id": "exp_20250720_130853",
-  "name": "smollm3_finetune",
-  "description": "SmolLM3 fine-tuning experiment with comprehensive metrics",
-  "created_at": "2025-07-20T11:20:01.780908",
-  "status": "running",
-  "metrics": "[{\"timestamp\": \"2025-07-20T11:20:01.780908\", \"step\": 25, \"metrics\": {\"loss\": 1.1659, \"accuracy\": 0.759, \"total_tokens\": 1642080.0, \"throughput\": 3284160.0, \"train/gate_ortho\": 0.0234, \"train/center\": 0.0156}}]",
-  "parameters": "{\"model_name\": \"HuggingFaceTB/SmolLM3-3B\", \"batch_size\": 8, \"learning_rate\": 3.5e-06, \"max_seq_length\": 12288}",
-  "artifacts": "[]",
-  "logs": "[]",
-  "last_updated": "2025-07-20T11:20:01.780908"
-}
-```
-
-## License
-
-This dataset is part of the Trackio experiment tracking system and follows the same license as the main project.
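Since this README documented `metrics` and `parameters` as JSON-encoded strings, consumers had to decode them per row. A minimal sketch of that decoding, using values drawn from the README's own sample entry:

```python
import json

row = {
    "metrics": '[{"step": 25, "metrics": {"loss": 1.1659, "accuracy": 0.759}}]',
    "parameters": '{"model_name": "HuggingFaceTB/SmolLM3-3B", "batch_size": 8}',
}
for entry in json.loads(row["metrics"]):
    print(entry["step"], entry["metrics"]["loss"])
params = json.loads(row["parameters"])
```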
tests/test_hf_setup.py ADDED
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Test Hugging Face Setup for Trackio Integration
+
+This script helps verify your Hugging Face token setup and test space name generation.
+Run this before using the training scripts to ensure everything is configured correctly.
+
+Authentication:
+    This script only checks for HF_TOKEN or HUGGINGFACE_HUB_TOKEN environment variables.
+    It does NOT use huggingface-cli login state.
+
+Setup:
+    Linux/Mac: export HF_TOKEN=your_token_here
+    Windows: set HF_TOKEN=your_token_here
+    Or: export HUGGINGFACE_HUB_TOKEN=your_token_here
+
+    Get your token from: https://huggingface.co/settings/tokens
+"""
+
+import os
+from datetime import datetime
+from typing import Tuple, Optional
+from huggingface_hub import HfApi
+
+
+def validate_hf_token(token: str) -> Tuple[bool, Optional[str], Optional[str]]:
+    """
+    Validate a Hugging Face token and return the username.
+
+    Args:
+        token (str): The Hugging Face token to validate
+
+    Returns:
+        Tuple[bool, Optional[str], Optional[str]]:
+        - success: True if token is valid, False otherwise
+        - username: The username associated with the token (if valid)
+        - error_message: Error message if validation failed
+    """
+    try:
+        # Create API client with token directly
+        api = HfApi(token=token)
+
+        # Try to get user info - this will fail if token is invalid
+        user_info = api.whoami()
+
+        # Extract username from user info
+        username = user_info.get("name", user_info.get("username"))
+
+        if not username:
+            return False, None, "Could not retrieve username from token"
+
+        return True, username, None
+
+    except Exception as e:
+        error_msg = str(e)
+        if "401" in error_msg or "unauthorized" in error_msg.lower():
+            return False, None, "Invalid token - unauthorized access"
+        elif "403" in error_msg:
+            return False, None, "Token lacks required permissions"
+        elif "network" in error_msg.lower() or "connection" in error_msg.lower():
+            return False, None, f"Network error: {error_msg}"
+        else:
+            return False, None, f"Validation error: {error_msg}"
+
+
+def get_default_space_name(project_type: str = "voxtral-asr-finetuning") -> str:
+    """
+    Generate a default space name with username and timestamp.
+
+    Args:
+        project_type: Type of project (e.g., "voxtral-asr-finetuning", "voxtral-lora-finetuning")
+
+    Returns:
+        str: Default space name in format "username/project-type-timestamp"
+    """
+    try:
+        # Get token from environment variables only
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+        if not token:
+            return None
+
+        # Validate token and get username
+        success, username, error = validate_hf_token(token)
+        if success and username:
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            return f"{username}/{project_type}-{timestamp}"
+        else:
+            return None
+
+    except Exception as e:
+        print(f"Failed to generate default space name: {e}")
+        return None
+
+
+def main():
+    print("🔍 Testing Hugging Face Setup for Trackio Integration")
+    print("=" * 60)
+
+    # Check for tokens
+    print("\n1. Checking for Hugging Face tokens...")
+
+    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+    if token:
+        print(f"✅ Found token in environment: {token[:10]}...")
+    else:
+        print("❌ No token found in environment variables")
+        print("\n❌ No Hugging Face token found!")
+        print("Please set the HF_TOKEN environment variable:")
+        print("  Linux/Mac: export HF_TOKEN=your_token_here")
+        print("  Windows: set HF_TOKEN=your_token_here")
+        print("  Or: set HUGGINGFACE_HUB_TOKEN=your_token_here")
+        print("\nGet your token from: https://huggingface.co/settings/tokens")
+        return
+
+    # Validate token
+    print("\n2. Validating token...")
+    success, username, error = validate_hf_token(token)
+
+    if success:
+        print(f"✅ Token is valid! Username: {username}")
+    else:
+        print(f"❌ Token validation failed: {error}")
+        return
+
+    # Generate space names
+    print("\n3. Generating default space names...")
+
+    full_finetune_space = get_default_space_name("voxtral-asr-finetuning")
+    lora_finetune_space = get_default_space_name("voxtral-lora-finetuning")
+
+    print(f"📝 Full fine-tuning space: {full_finetune_space}")
+    print(f"📝 LoRA fine-tuning space: {lora_finetune_space}")
+
+    print("\n✅ Setup complete! You can now run training scripts.")
+    print("   They will automatically use the generated space names.")
+    print("\n💡 To override the auto-generated names, use --trackio-space yourname/custom-space")
+
+
+if __name__ == "__main__":
+    main()