#!/usr/bin/env python3
"""
Push Trained Models and Datasets to Hugging Face Hub

Usage:
    # Push a trained model
    python push_to_huggingface.py model /path/to/model my-model-repo

    # Push a dataset
    python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo

Authentication:
    Set HF_TOKEN environment variable or use --token:
        export HF_TOKEN=your_token_here
"""

import os
import json
import argparse
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from datetime import datetime

# Set timeout for HF operations to prevent hanging
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600'

try:
    from huggingface_hub import HfApi, create_repo, upload_file
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")

logger = logging.getLogger(__name__)

class HuggingFacePusher:
    """Push trained models to Hugging Face Hub"""

    def __init__(
        self,
        model_path: str,
        repo_name: str,
        token: Optional[str] = None,
        private: bool = False,
        author_name: Optional[str] = None,
        model_description: Optional[str] = None,
        model_name: Optional[str] = None,
        dataset_name: Optional[str] = None,
        # Optional metadata for model card generation
        experiment_name: Optional[str] = None,
        dataset_repo: Optional[str] = None,
        training_config_type: Optional[str] = None,
        trainer_type: Optional[str] = None,
        batch_size: Optional[str] = None,
        gradient_accumulation_steps: Optional[str] = None,
        learning_rate: Optional[str] = None,
        max_epochs: Optional[str] = None,
        max_seq_length: Optional[str] = None,
        trackio_url: Optional[str] = None,
    ):
        self.model_path = Path(model_path)
        # Original user input (may be just the repo name without username)
        self.repo_name = repo_name
        self.token = token or os.getenv('HF_TOKEN')
        self.private = private
        self.author_name = author_name
        self.model_description = model_description
        # Model card generation details
        self.model_name = model_name
        self.dataset_name = dataset_name
        # Optional metadata (ensure attributes always exist to avoid AttributeError)
        self.experiment_name = experiment_name
        self.dataset_repo = dataset_repo
        self.training_config_type = training_config_type
        self.trainer_type = trainer_type
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.max_seq_length = max_seq_length
        self.trackio_url = trackio_url

        # Initialize HF API
        if HF_AVAILABLE:
            self.api = HfApi(token=self.token)
        else:
            raise ImportError("huggingface_hub is required. Install with: pip install huggingface_hub")

        # Resolve the full repo id (username/repo) if user only provided repo name
        self.repo_id = self._resolve_repo_id(self.repo_name)

        # Artifact type detection (full vs lora)
        self.artifact_type: Optional[str] = None

        logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
    def _resolve_repo_id(self, repo_name: str) -> str:
        """Return a fully-qualified repo id in the form username/repo.

        If the provided name already contains a '/', it is returned unchanged.
        Otherwise, we attempt to derive the username from the authenticated token
        or from the HF_USERNAME environment variable.
        """
        try:
            if "/" in repo_name:
                return repo_name
            # Need a username. Prefer API whoami(), fall back to env HF_USERNAME
            username: Optional[str] = None
            if self.token:
                try:
                    user_info = self.api.whoami()
                    username = user_info.get("name") or user_info.get("username")
                except Exception:
                    username = None
            if not username:
                username = os.getenv("HF_USERNAME")
            if not username:
                raise ValueError(
                    "Username could not be determined. Provide a token or set HF_USERNAME, "
                    "or pass a fully-qualified repo id 'username/repo'."
                )
            return f"{username}/{repo_name}"
        except Exception as resolve_error:
            logger.error(f"Failed to resolve full repo id for '{repo_name}': {resolve_error}")
            # Fall back to provided value (may fail later at create/upload)
            return repo_name
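
    # Illustrative sketch of the resolution order above (comments only, not executed).
    # The username "alice" is a hypothetical example, not a value used by this script:
    #   _resolve_repo_id("alice/my-model")                  -> "alice/my-model" (already qualified)
    #   _resolve_repo_id("my-model") with a valid token     -> "alice/my-model" (via api.whoami())
    #   _resolve_repo_id("my-model") with HF_USERNAME=alice -> "alice/my-model" (env fallback)
    #   _resolve_repo_id("my-model") with neither           -> logs an error, returns "my-model"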

    def create_repository(self) -> bool:
        """Create the Hugging Face repository"""
        try:
            logger.info(f"Creating repository: {self.repo_id}")
            # Create repository with timeout handling
            try:
                create_repo(
                    repo_id=self.repo_id,
                    token=self.token,
                    private=self.private,
                    exist_ok=True
                )
                logger.info(f"✅ Repository created: https://huggingface.co/{self.repo_id}")
                return True
            except Exception as e:
                logger.error(f"❌ Repository creation failed: {e}")
                return False
        except Exception as e:
            logger.error(f"❌ Failed to create repository: {e}")
            return False

    def _detect_artifact_type(self) -> str:
        """Detect whether the output dir contains a full model or a LoRA adapter."""
        logger.info(f"Detecting model artifacts in: {self.model_path}")

        # Check if path exists
        if not self.model_path.exists():
            logger.error(f"❌ Model path does not exist: {self.model_path}")
            return "unknown"

        # List all files for debugging
        all_files = list(self.model_path.rglob("*"))
        logger.info(f"📁 Found {len(all_files)} files in model directory")
        if len(all_files) <= 20:  # Only show if not too many files
            for f in all_files:
                logger.info(f"  - {f.relative_to(self.model_path)}")

        # LoRA artifacts - be more flexible about file combinations
        lora_config = self.model_path / "adapter_config.json"
        lora_weights_safetensors = self.model_path / "adapter_model.safetensors"
        lora_weights_bin = self.model_path / "adapter_model.bin"
        has_lora_config = lora_config.exists()
        has_lora_weights = lora_weights_safetensors.exists() or lora_weights_bin.exists()

        if has_lora_config:
            logger.info("✅ Found adapter_config.json")
        if has_lora_weights:
            logger.info("✅ Found LoRA weight files")

        if has_lora_config and has_lora_weights:
            logger.info("🎯 Detected LoRA adapter artifacts")
            return "lora"
        elif has_lora_config:
            logger.warning("⚠️ Found adapter_config.json but no weight files")
        elif has_lora_weights:
            logger.warning("⚠️ Found LoRA weight files but no adapter_config.json")

        # Full model artifacts - also be more flexible
        config_file = self.model_path / "config.json"
        safetensors_model = self.model_path / "model.safetensors"
        safetensors_index = self.model_path / "model.safetensors.index.json"
        pytorch_model = self.model_path / "pytorch_model.bin"
        has_config = config_file.exists()
        has_weights = (safetensors_model.exists() or safetensors_index.exists() or pytorch_model.exists())

        if has_config:
            logger.info("✅ Found config.json")
        if has_weights:
            logger.info("✅ Found model weight files")

        if has_config and has_weights:
            logger.info("🎯 Detected full model artifacts")
            return "full"
        elif has_config:
            logger.warning("⚠️ Found config.json but no weight files")
        elif has_weights:
            logger.warning("⚠️ Found weight files but no config.json")

        logger.error("❌ Could not detect model artifacts (neither full model nor LoRA)")
        return "unknown"

    def validate_model_path(self) -> bool:
        """Validate that the model path contains required files for Voxtral full or LoRA."""
        self.artifact_type = self._detect_artifact_type()

        if self.artifact_type == "unknown":
            logger.error("❌ Could not detect model type. Expected files:")
            logger.error("  For LoRA: adapter_config.json + adapter_model.safetensors (or .bin)")
            logger.error("  For Full Model: config.json + model.safetensors (or pytorch_model.bin)")
            logger.error("  For Voxtral ASR: also look for processor_config.json, tokenizer.json, etc.")
            return False

        if self.artifact_type == "lora":
            # Check for required LoRA files
            config_file = self.model_path / "adapter_config.json"
            weights_file_safetensors = self.model_path / "adapter_model.safetensors"
            weights_file_bin = self.model_path / "adapter_model.bin"

            if not config_file.exists():
                logger.error("❌ LoRA adapter missing required file: adapter_config.json")
                return False
            if not (weights_file_safetensors.exists() or weights_file_bin.exists()):
                logger.error("❌ LoRA adapter missing weight files: adapter_model.safetensors or adapter_model.bin")
                return False

            logger.info("✅ LoRA adapter validation successful")
            logger.info(f"  - Config: {config_file.name}")
            if weights_file_safetensors.exists():
                logger.info(f"  - Weights: {weights_file_safetensors.name}")
            elif weights_file_bin.exists():
                logger.info(f"  - Weights: {weights_file_bin.name}")
            return True

        if self.artifact_type == "full":
            # Check for required full model files
            config_file = self.model_path / "config.json"
            safetensors_file = self.model_path / "model.safetensors"
            safetensors_index = self.model_path / "model.safetensors.index.json"
            pytorch_file = self.model_path / "pytorch_model.bin"

            if not config_file.exists():
                logger.error("❌ Full model missing required file: config.json")
                return False
            if not (safetensors_file.exists() or safetensors_index.exists() or pytorch_file.exists()):
                logger.error("❌ Full model missing weight files: model.safetensors, model.safetensors.index.json, or pytorch_model.bin")
                return False

            logger.info("✅ Full model validation successful")
            logger.info(f"  - Config: {config_file.name}")
            if safetensors_file.exists():
                logger.info(f"  - Weights: {safetensors_file.name}")
            elif safetensors_index.exists():
                logger.info(f"  - Weights: {safetensors_index.name} (sharded)")
            elif pytorch_file.exists():
                logger.info(f"  - Weights: {pytorch_file.name}")
            return True

        return False

    def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
        """Create a comprehensive model card using the generate_model_card.py script"""
        try:
            # Import the model card generator from the same directory as this script
            import sys
            sys.path.append(os.path.dirname(__file__))
            from generate_model_card import ModelCardGenerator, create_default_variables

            # Create generator
            generator = ModelCardGenerator()

            # Create variables for the model card
            variables = create_default_variables()

            # Determine whether dataset_name looks like a valid Hub dataset id (owner/dataset)
            hub_dataset = (self.dataset_name or "").strip()
            has_hub_dataset_id = bool(hub_dataset and "/" in hub_dataset and " " not in hub_dataset and len(hub_dataset.split("/")) == 2)

            # Update with actual values
            variables.update({
                "repo_name": self.repo_id,
                "model_name": self.repo_id.split('/')[-1],
                "experiment_name": self.experiment_name or "model_push",
                "dataset_repo": self.dataset_repo or "",
                "author_name": self.author_name or "Model Author",
                "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
                "training_config_type": self.training_config_type or "Custom Configuration",
                "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
                "dataset_name": hub_dataset if hub_dataset else "",
                "has_hub_dataset_id": has_hub_dataset_id,
                # Only include model-index when a dataset is provided or when metrics are meaningful
                "include_model_index": bool(hub_dataset),
                "trainer_type": self.trainer_type or "SFTTrainer",
                "batch_size": str(self.batch_size) if self.batch_size else "8",
                "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"),
                "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
                "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
                "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
                "hardware_info": self._get_hardware_info(),
                "trackio_url": self.trackio_url or "N/A",
                "training_loss": str(results.get('train_loss', 'N/A')),
                "validation_loss": str(results.get('eval_loss', 'N/A')),
                "perplexity": str(results.get('perplexity', 'N/A')),
                "quantized_models": False  # Set to True if quantized models are available
            })

            # Generate the model card
            model_card_content = generator.generate_model_card(variables)
            logger.info("✅ Model card generated using generate_model_card.py")
            return model_card_content
        except Exception as e:
            logger.error(f"❌ Failed to generate model card with generator: {e}")
            logger.info("📝 Falling back to simple model card")
            return self._create_simple_model_card(training_config, results)

    def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
        """Create a simple model card tailored for Voxtral ASR (supports full and LoRA)."""
        tags = ["voxtral", "asr", "speech-to-text", "fine-tuning"]
        if self.artifact_type == "lora":
            tags.append("lora")

        # YAML front matter for the Hub model card
        fm_yaml = (
            "---\n"
            "license: apache-2.0\n"
            "tags:\n"
            + "\n".join(f"- {t}" for t in tags)
            + "\npipeline_tag: automatic-speech-recognition\n---\n\n"
        )

        model_title = self.repo_id.split('/')[-1]
        body = [
            f"# {model_title}",
            "",
            ("This repository contains a LoRA adapter for Voxtral ASR. "
             "Merge the adapter with the base model or load via PEFT for inference." if self.artifact_type == "lora" else
             "This repository contains a fine-tuned Voxtral ASR model."),
            "",
            "## Usage",
            "",
            (f"```python\nfrom transformers import AutoProcessor\nfrom peft import PeftModel\nfrom transformers import AutoModelForSeq2SeqLM\n\nbase_model_id = 'mistralai/Voxtral-Mini-3B-2507'\nprocessor = AutoProcessor.from_pretrained(base_model_id)\nbase_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)\nmodel = PeftModel.from_pretrained(base_model, '{self.repo_id}')\n```" if self.artifact_type == "lora" else
             f"""```python
from transformers import AutoProcessor, AutoModelForSeq2SeqLM
processor = AutoProcessor.from_pretrained("{self.repo_id}")
model = AutoModelForSeq2SeqLM.from_pretrained("{self.repo_id}")
```"""),
            "",
            "## Training Configuration",
            "",
            f"```json\n{json.dumps(training_config or {}, indent=2)}\n```",
            "",
            "## Training Results",
            "",
            f"```json\n{json.dumps(results or {}, indent=2)}\n```",
            "",
            f"**Hardware**: {self._get_hardware_info()}",
        ]
        return fm_yaml + "\n".join(body)
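
    # Example front matter emitted by the fallback card above for a LoRA adapter
    # (sketch; the tag list depends on self.artifact_type):
    #   ---
    #   license: apache-2.0
    #   tags:
    #   - voxtral
    #   - asr
    #   - speech-to-text
    #   - fine-tuning
    #   - lora
    #   pipeline_tag: automatic-speech-recognition
    #   ---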

    def _get_model_size(self) -> float:
        """Get model size in GB"""
        try:
            total_size = 0
            for file in self.model_path.rglob("*"):
                if file.is_file():
                    total_size += file.stat().st_size
            return total_size / (1024**3)  # Convert to GB
        except Exception:
            return 0.0

    def _get_hardware_info(self) -> str:
        """Get hardware information"""
        try:
            import torch
            if torch.cuda.is_available():
                gpu_name = torch.cuda.get_device_name(0)
                return f"GPU: {gpu_name}"
            else:
                return "CPU"
        except Exception:
            return "Unknown"

    def upload_model_files(self) -> bool:
        """Upload model files to Hugging Face Hub with timeout protection"""
        try:
            logger.info("Uploading model files...")

            # Upload all files in the model directory
            for file_path in self.model_path.rglob("*"):
                if file_path.is_file():
                    relative_path = file_path.relative_to(self.model_path)
                    remote_path = str(relative_path)
                    logger.info(f"Uploading {relative_path}")
                    try:
                        upload_file(
                            path_or_fileobj=str(file_path),
                            path_in_repo=remote_path,
                            repo_id=self.repo_id,
                            token=self.token
                        )
                        logger.info(f"✅ Uploaded {relative_path}")
                    except Exception as e:
                        logger.error(f"❌ Failed to upload {relative_path}: {e}")
                        return False

            logger.info("✅ Model files uploaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to upload model files: {e}")
            return False

    def upload_training_results(self, results_path: str) -> bool:
        """Upload training results and logs"""
        try:
            logger.info("Uploading training results...")
            results_files = [
                "train_results.json",
                "eval_results.json",
                "training_config.json",
                "training.log"
            ]
            for file_name in results_files:
                file_path = Path(results_path) / file_name
                if file_path.exists():
                    logger.info(f"Uploading {file_name}")
                    upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=f"training_results/{file_name}",
                        repo_id=self.repo_id,
                        token=self.token
                    )
            logger.info("✅ Training results uploaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to upload training results: {e}")
            return False
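
    # Note: files uploaded by upload_training_results() land under training_results/
    # in the model repo, e.g. training_results/train_results.json.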

    def create_readme(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> bool:
        """Create and upload README.md"""
        try:
            logger.info("Creating README.md...")
            readme_content = f"""# {self.repo_id.split('/')[-1]}

A fine-tuned SmolLM3 model for text generation tasks.

## Quick Start

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{self.repo_id}")
tokenizer = AutoTokenizer.from_pretrained("{self.repo_id}")

# Generate text
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Model Information

- **Base Model**: HuggingFaceTB/SmolLM3-3B
- **Fine-tuning Date**: {datetime.now().strftime('%Y-%m-%d')}
- **Model Size**: {self._get_model_size():.1f} GB
- **Training Steps**: {results.get('total_steps', 'Unknown')}
- **Final Loss**: {results.get('final_loss', 'Unknown')}
- **Dataset Repository**: {self.dataset_repo}

## Training Configuration

```json
{json.dumps(training_config, indent=2)}
```

## Performance Metrics

```json
{json.dumps(results, indent=2)}
```

## Experiment Tracking

Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}`

## Files

- `model.safetensors` (or sharded `model.safetensors.index.json`): Model weights (safetensors format)
- `config.json`: Model configuration
- `tokenizer.json`: Tokenizer configuration
- `training_results/`: Training logs and results

## License

MIT License
"""
            # Write README to a temporary file
            readme_path = Path("temp_readme.md")
            with open(readme_path, "w") as f:
                f.write(readme_content)

            # Upload README
            upload_file(
                path_or_fileobj=str(readme_path),
                path_in_repo="README.md",
                repo_id=self.repo_id,
                token=self.token
            )

            # Clean up
            readme_path.unlink()
            logger.info("✅ README.md uploaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to create README: {e}")
            return False

    def push_model(self, training_config: Optional[Dict[str, Any]] = None,
                   results: Optional[Dict[str, Any]] = None) -> bool:
        """Complete model push process"""
        logger.info(f"🚀 Starting model push to {self.repo_id}")
        logger.info(f"📁 Model path: {self.model_path}")
        logger.info(f"🎯 Repository: {self.repo_id}")

        # Validate model path
        if not self.validate_model_path():
            logger.error("❌ Model validation failed. Please check:")
            logger.error("  1. The model path exists and contains the expected files")
            logger.error("  2. For LoRA models: adapter_config.json and adapter_model.* files")
            logger.error("  3. For full models: config.json and model weight files")
            logger.error("  4. Make sure the training completed successfully and saved the model")
            return False

        # Create repository
        if not self.create_repository():
            return False

        # Load training config and results if not provided
        if training_config is None:
            training_config = self._load_training_config()
        if results is None:
            results = self._load_training_results()

        # Create model card and persist it inside the model directory as README.md
        model_card = self.create_model_card(training_config, results)
        local_readme_path = self.model_path / "README.md"
        try:
            with open(local_readme_path, "w", encoding="utf-8") as f:
                f.write(model_card)
        except Exception as e:
            logger.warning(f"⚠️ Could not write README.md to model directory: {e}")

        # Upload README.md from the model directory if it was written, otherwise from memory
        upload_file(
            path_or_fileobj=str(local_readme_path) if local_readme_path.exists() else model_card.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=self.repo_id,
            token=self.token
        )

        # Upload model files
        if not self.upload_model_files():
            return False

        # Upload training results
        if results:
            self.upload_training_results(str(self.model_path))

        # Log success
        logger.info(f"✅ Model successfully pushed to {self.repo_id}")
        logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_id}")
        return True
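
    # Minimal programmatic usage sketch (assumes HF_TOKEN is set; the model path and
    # repo name below are hypothetical placeholders):
    #   pusher = HuggingFacePusher(
    #       model_path="./outputs/voxtral-finetuned",
    #       repo_name="my-voxtral-asr",
    #   )
    #   pusher.push_model()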

    def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
        """Push dataset to Hugging Face Hub including audio files"""
        logger.info(f"🚀 Starting dataset push to {dataset_repo_name}")
        try:
            from huggingface_hub import create_repo, upload_file
            import json

            # Determine full dataset repo name
            if "/" not in dataset_repo_name:
                dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}"

            # Create dataset repository
            try:
                create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True)
                logger.info(f"✅ Created dataset repository: {dataset_repo_name}")
            except Exception as e:
                if "already exists" not in str(e).lower():
                    logger.error(f"❌ Failed to create dataset repo: {e}")
                    return False
                logger.info(f"📁 Dataset repository already exists: {dataset_repo_name}")

            # Read the dataset file
            dataset_file = Path(dataset_path)
            if not dataset_file.exists():
                logger.error(f"❌ Dataset file not found: {dataset_path}")
                return False

            # Read and process the JSONL to collect audio files and update paths
            audio_files = []
            updated_rows = []
            total_audio_size = 0
            with open(dataset_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f):
                    try:
                        row = json.loads(line.strip())
                        audio_path = row.get("audio_path", "")
                        if audio_path:
                            audio_file = Path(audio_path)
                            if audio_file.exists():
                                # Store the original file for upload
                                audio_files.append(audio_file)
                                total_audio_size += audio_file.stat().st_size
                                # Update path to be relative for the dataset
                                row["audio_path"] = f"audio/{audio_file.name}"
                            else:
                                logger.warning(f"Audio file not found: {audio_path}")
                                row["audio_path"] = ""  # Clear missing files
                        updated_rows.append(row)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Invalid JSON on line {line_num + 1}: {e}")
                        continue

            # Create updated JSONL with relative paths
            temp_jsonl_path = dataset_file.parent / "temp_data.jsonl"
            with open(temp_jsonl_path, "w", encoding="utf-8") as f:
                for row in updated_rows:
                    f.write(json.dumps(row, ensure_ascii=False) + "\n")

            # Upload the updated JSONL file
            upload_file(
                path_or_fileobj=str(temp_jsonl_path),
                path_in_repo="data.jsonl",
                repo_id=dataset_repo_name,
                repo_type="dataset",
                token=self.token
            )
            logger.info(f"✅ Uploaded dataset file: {dataset_file.name}")

            # Clean up temp file
            temp_jsonl_path.unlink()

            # Upload audio files
            uploaded_count = 0
            for audio_file in audio_files:
                try:
                    remote_path = f"audio/{audio_file.name}"
                    upload_file(
                        path_or_fileobj=str(audio_file),
                        path_in_repo=remote_path,
                        repo_id=dataset_repo_name,
                        repo_type="dataset",
                        token=self.token
                    )
                    uploaded_count += 1
                    logger.info(f"✅ Uploaded audio file: {audio_file.name}")
                except Exception as e:
                    logger.error(f"❌ Failed to upload {audio_file.name}: {e}")

            # Calculate total dataset size
            total_dataset_size = dataset_file.stat().st_size + total_audio_size

            # Create a comprehensive dataset README
            readme_content = f"""---
dataset_info:
  features:
  - name: audio_path
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: {dataset_file.stat().st_size}
    num_examples: {len(updated_rows)}
  download_size: {total_dataset_size}
  dataset_size: {total_dataset_size}
tags:
- voxtral
- asr
- speech-to-text
- fine-tuning
- audio-dataset
- tonic
---

# Voxtral ASR Dataset

This dataset was created for fine-tuning Voxtral ASR models.

## Dataset Structure

- **audio_path**: Relative path to the audio file (stored in the `audio/` directory)
- **text**: Transcription of the audio

## Dataset Statistics

- **Number of examples**: {len(updated_rows)}
- **Audio files uploaded**: {uploaded_count}
- **Total dataset size**: {total_dataset_size:,} bytes

## Usage

```python
from datasets import load_dataset, Audio

# Load dataset
dataset = load_dataset("{dataset_repo_name}")

# Load audio data
dataset = dataset.cast_column("audio_path", Audio())

# Access the first training example
print(dataset["train"][0]["text"])
print(dataset["train"][0]["audio_path"])
```

## Loading with Audio Decoding

```python
from datasets import load_dataset, Audio

# Load with automatic audio decoding
dataset = load_dataset("{dataset_repo_name}")
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))

# The audio column will contain the decoded audio arrays
audio_array = dataset["train"][0]["audio_path"]["array"]
sampling_rate = dataset["train"][0]["audio_path"]["sampling_rate"]
```

## Dataset Features

This dataset contains audio files with corresponding transcriptions for Voxtral ASR model fine-tuning.
All audio files are stored in the `audio/` directory and referenced using relative paths in the dataset.

## License

This dataset is created for research and educational purposes.
"""
            # Upload README
            readme_path = dataset_file.parent / "README.md"
            with open(readme_path, "w", encoding="utf-8") as f:
                f.write(readme_content)
            upload_file(
                path_or_fileobj=str(readme_path),
                path_in_repo="README.md",
                repo_id=dataset_repo_name,
                repo_type="dataset",
                token=self.token
            )
            readme_path.unlink()  # Clean up temp file
            logger.info("✅ Dataset README uploaded")
            logger.info(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
            logger.info(f"📊 Uploaded {len(updated_rows)} examples and {uploaded_count} audio files")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to push dataset: {e}")
            return False
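
    # Resulting layout in the dataset repo (sketch):
    #   data.jsonl        # rows with "audio_path" rewritten to audio/<filename>
    #   audio/<filename>  # one file per row whose audio file existed locally
    #   README.md         # generated dataset card with dataset_info front matter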

    def test_dataset_push(self, dataset_path: str) -> bool:
        """Test dataset validation without uploading to Hugging Face Hub"""
        logger.info(f"🧪 Testing dataset validation for {dataset_path}")
        try:
            # Read the dataset file
            dataset_file = Path(dataset_path)
            if not dataset_file.exists():
                logger.error(f"❌ Dataset file not found: {dataset_path}")
                return False

            # Read and process the JSONL to validate audio files
            audio_files = []
            updated_rows = []
            total_audio_size = 0
            missing_files = []
            invalid_json_lines = []
            with open(dataset_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f):
                    try:
                        row = json.loads(line.strip())
                        audio_path = row.get("audio_path", "")
                        if audio_path:
                            audio_file = Path(audio_path)
                            if audio_file.exists():
                                # Store the file info for validation
                                audio_files.append(audio_file)
                                total_audio_size += audio_file.stat().st_size
                            else:
                                missing_files.append(str(audio_path))
                        updated_rows.append(row)
                    except json.JSONDecodeError as e:
                        invalid_json_lines.append(f"Line {line_num + 1}: {e}")
                        continue

            # Report validation results
            logger.info("📊 Dataset Validation Results:")
            logger.info(f"  - Total examples: {len(updated_rows)}")
            logger.info(f"  - Valid audio files: {len(audio_files)}")
            logger.info(f"  - Total audio size: {total_audio_size:,} bytes")
            logger.info(f"  - Missing audio files: {len(missing_files)}")
            logger.info(f"  - Invalid JSON lines: {len(invalid_json_lines)}")

            if missing_files:
                logger.warning("⚠️ Missing audio files:")
                for missing in missing_files[:5]:  # Show first 5
                    logger.warning(f"  - {missing}")
                if len(missing_files) > 5:
                    logger.warning(f"  ... and {len(missing_files) - 5} more")

            if invalid_json_lines:
                logger.warning("⚠️ Invalid JSON lines:")
                for invalid in invalid_json_lines[:3]:  # Show first 3
                    logger.warning(f"  - {invalid}")
                if len(invalid_json_lines) > 3:
                    logger.warning(f"  ... and {len(invalid_json_lines) - 3} more")

            # Show a sample of how paths will be converted
            if audio_files:
                logger.info("🔄 Path conversion preview:")
                for audio_file in audio_files[:3]:  # Show first 3
                    logger.info(f"  - {str(audio_file)} → audio/{audio_file.name}")

            # Overall validation status
            if len(updated_rows) == 0:
                logger.error("❌ No valid examples found in dataset")
                return False

            if len(missing_files) > 0:
                logger.warning("⚠️ Some audio files are missing - they will be skipped during upload")
            else:
                logger.info("✅ All audio files found and valid")

            logger.info("✅ Dataset validation completed successfully!")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to validate dataset: {e}")
            return False

    def _load_training_config(self) -> Dict[str, Any]:
        """Load training configuration"""
        config_path = self.model_path / "training_config.json"
        if config_path.exists():
            with open(config_path, "r") as f:
                return json.load(f)
        return {"model_name": "HuggingFaceTB/SmolLM3-3B"}

    def _load_training_results(self) -> Dict[str, Any]:
        """Load training results"""
        results_path = self.model_path / "train_results.json"
        if results_path.exists():
            with open(results_path, "r") as f:
                return json.load(f)
        return {"final_loss": "Unknown", "total_steps": "Unknown"}

def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Push trained models and datasets to Hugging Face Hub')

    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Model push subcommand
    model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub')
    model_parser.add_argument('model_path', type=str, help='Path to trained model directory')
    model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
    model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
    model_parser.add_argument('--private', action='store_true', help='Make repository private')
    model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
    model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
    model_parser.add_argument('--model-name', type=str, default=None, help='Base model name')
    model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
    # Optional model card metadata
    model_parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for model card')
    model_parser.add_argument('--dataset-repo', type=str, default=None, help='Dataset repo for model card')
    model_parser.add_argument('--training-config-type', type=str, default=None, help='Training config type for model card')
    model_parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type for model card')
    model_parser.add_argument('--batch-size', type=str, default=None, help='Batch size for model card')
    model_parser.add_argument('--gradient-accumulation-steps', type=str, default=None, help='Gradient accumulation steps for model card')
    model_parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate for model card')
    model_parser.add_argument('--max-epochs', type=str, default=None, help='Max epochs for model card')
    model_parser.add_argument('--max-seq-length', type=str, default=None, help='Max seq length for model card')
    model_parser.add_argument('--trackio-url', type=str, default=None, help='Trackio URL for model card')

    # Dataset push subcommand
    dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub')
    dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file')
    dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
    dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
    dataset_parser.add_argument('--private', action='store_true', help='Make repository private')
    dataset_parser.add_argument('--test', action='store_true', help='Test mode - validate dataset without uploading')

    return parser.parse_args()
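
# Example invocations (sketch; the repo names and paths are placeholders):
#   python push_to_huggingface.py model ./outputs/voxtral-finetuned my-voxtral-asr \
#       --trainer-type SFTTrainer --batch-size 8
#   python push_to_huggingface.py dataset ./data.jsonl my-voxtral-dataset --test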

def main():
    """Main function"""
    args = parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    if not args.command:
        logger.error("❌ No command specified. Use 'model' or 'dataset' subcommand.")
        return 1

    try:
        if args.command == 'model':
            logger.info("Starting model push to Hugging Face Hub")

            # Initialize pusher
            pusher = HuggingFacePusher(
                model_path=args.model_path,
                repo_name=args.repo_name,
                token=args.token,
                private=args.private,
                author_name=args.author_name,
                model_description=args.model_description,
                model_name=args.model_name,
                dataset_name=args.dataset_name,
                experiment_name=args.experiment_name,
                dataset_repo=args.dataset_repo,
                training_config_type=args.training_config_type,
                trainer_type=args.trainer_type,
                batch_size=args.batch_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps,
                learning_rate=args.learning_rate,
                max_epochs=args.max_epochs,
                max_seq_length=args.max_seq_length,
                trackio_url=args.trackio_url,
            )

            # Push model
            success = pusher.push_model()
            if success:
                logger.info("✅ Model push completed successfully!")
                logger.info(f"🎉 View your model at: https://huggingface.co/{pusher.repo_id}")
            else:
                logger.error("❌ Model push failed!")
                return 1
        elif args.command == 'dataset':
            logger.info("Starting dataset push to Hugging Face Hub")

            # Initialize pusher for dataset push
            pusher = HuggingFacePusher(
                model_path="",  # Not needed for dataset push
                repo_name=args.repo_name,
                token=args.token,
                private=args.private
            )

            if getattr(args, 'test', False):
                # Test mode - validate dataset without uploading
                success = pusher.test_dataset_push(args.dataset_path)
                if success:
                    logger.info("✅ Dataset validation completed successfully!")
                else:
                    logger.error("❌ Dataset validation failed!")
                    return 1
            else:
                # Push dataset
                success = pusher.push_dataset(args.dataset_path, args.repo_name)
                if success:
                    logger.info("✅ Dataset push completed successfully!")
                    logger.info(f"🎉 View your dataset at: https://huggingface.co/datasets/{pusher.repo_id}")
                else:
                    logger.error("❌ Dataset push failed!")
                    return 1
    except Exception as e:
        logger.error(f"❌ Error during push: {e}")
        return 1

    return 0

if __name__ == "__main__":
    exit(main())