#!/usr/bin/env python3 """ Push Trained Models and Datasets to Hugging Face Hub Usage: # Push a trained model python push_to_huggingface.py model /path/to/model my-model-repo # Push a dataset python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo Authentication: Set HF_TOKEN environment variable or use --token: export HF_TOKEN=your_token_here """ import os import json import argparse import logging from pathlib import Path from typing import Dict, Any, Optional from datetime import datetime # Set timeout for HF operations to prevent hanging os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300' os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600' try: from huggingface_hub import HfApi, create_repo, upload_file HF_AVAILABLE = True except ImportError: HF_AVAILABLE = False print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub") logger = logging.getLogger(__name__) class HuggingFacePusher: """Push trained models to Hugging Face Hub""" def __init__( self, model_path: str, repo_name: str, token: Optional[str] = None, private: bool = False, author_name: Optional[str] = None, model_description: Optional[str] = None, model_name: Optional[str] = None, dataset_name: Optional[str] = None, # Optional metadata for model card generation experiment_name: Optional[str] = None, dataset_repo: Optional[str] = None, training_config_type: Optional[str] = None, trainer_type: Optional[str] = None, batch_size: Optional[str] = None, gradient_accumulation_steps: Optional[str] = None, learning_rate: Optional[str] = None, max_epochs: Optional[str] = None, max_seq_length: Optional[str] = None, trackio_url: Optional[str] = None, ): self.model_path = Path(model_path) # Original user input (may be just the repo name without username) self.repo_name = repo_name self.token = token or os.getenv('HF_TOKEN') self.private = private self.author_name = author_name self.model_description = model_description # Model card generation details self.model_name = model_name self.dataset_name = dataset_name # Optional metadata (ensure attributes always exist to avoid AttributeError) self.experiment_name = experiment_name self.dataset_repo = dataset_repo self.training_config_type = training_config_type self.trainer_type = trainer_type self.batch_size = batch_size self.gradient_accumulation_steps = gradient_accumulation_steps self.learning_rate = learning_rate self.max_epochs = max_epochs self.max_seq_length = max_seq_length self.trackio_url = trackio_url # Initialize HF API if HF_AVAILABLE: self.api = HfApi(token=self.token) else: raise ImportError("huggingface_hub is required. Install with: pip install huggingface_hub") # Resolve the full repo id (username/repo) if user only provided repo name self.repo_id = self._resolve_repo_id(self.repo_name) # Artifact type detection (full vs lora) self.artifact_type: Optional[str] = None logger.info(f"Initialized HuggingFacePusher for {self.repo_id}") def _resolve_repo_id(self, repo_name: str) -> str: """Return a fully-qualified repo id in the form username/repo. If the provided name already contains a '/', it is returned unchanged. Otherwise, we attempt to derive the username from the authenticated token or from the HF_USERNAME environment variable. """ try: if "/" in repo_name: return repo_name # Need a username. Prefer API whoami(), fallback to env HF_USERNAME username: Optional[str] = None if self.token: try: user_info = self.api.whoami() username = user_info.get("name") or user_info.get("username") except Exception: username = None if not username: username = os.getenv("HF_USERNAME") if not username: raise ValueError( "Username could not be determined. Provide a token or set HF_USERNAME, " "or pass a fully-qualified repo id 'username/repo'." ) return f"{username}/{repo_name}" except Exception as resolve_error: logger.error(f"Failed to resolve full repo id for '{repo_name}': {resolve_error}") # Fall back to provided value (may fail later at create/upload) return repo_name def create_repository(self) -> bool: """Create the Hugging Face repository""" try: logger.info(f"Creating repository: {self.repo_id}") # Create repository with timeout handling try: # Create repository create_repo( repo_id=self.repo_id, token=self.token, private=self.private, exist_ok=True ) logger.info(f"โœ… Repository created: https://huggingface.co/{self.repo_id}") return True except Exception as e: logger.error(f"โŒ Repository creation failed: {e}") return False except Exception as e: logger.error(f"โŒ Failed to create repository: {e}") return False def _detect_artifact_type(self) -> str: """Detect whether output dir contains a full model or a LoRA adapter.""" logger.info(f"Detecting model artifacts in: {self.model_path}") # Check if path exists if not self.model_path.exists(): logger.error(f"โŒ Model path does not exist: {self.model_path}") return "unknown" # List all files for debugging all_files = list(self.model_path.rglob("*")) logger.info(f"๐Ÿ“ Found {len(all_files)} files in model directory") if len(all_files) <= 20: # Only show if not too many files for f in all_files: logger.info(f" - {f.relative_to(self.model_path)}") # LoRA artifacts - be more flexible about file combinations lora_config = self.model_path / "adapter_config.json" lora_weights_safetensors = self.model_path / "adapter_model.safetensors" lora_weights_bin = self.model_path / "adapter_model.bin" has_lora_config = lora_config.exists() has_lora_weights = lora_weights_safetensors.exists() or lora_weights_bin.exists() if has_lora_config: logger.info("โœ… Found adapter_config.json") if has_lora_weights: logger.info("โœ… Found LoRA weight files") if has_lora_config and has_lora_weights: logger.info("๐ŸŽฏ Detected LoRA adapter artifacts") return "lora" elif has_lora_config: logger.warning("โš ๏ธ Found adapter_config.json but no weight files") elif has_lora_weights: logger.warning("โš ๏ธ Found LoRA weight files but no adapter_config.json") # Full model artifacts - also be more flexible config_file = self.model_path / "config.json" safetensors_model = self.model_path / "model.safetensors" safetensors_index = self.model_path / "model.safetensors.index.json" pytorch_model = self.model_path / "pytorch_model.bin" has_config = config_file.exists() has_weights = (safetensors_model.exists() or safetensors_index.exists() or pytorch_model.exists()) if has_config: logger.info("โœ… Found config.json") if has_weights: logger.info("โœ… Found model weight files") if has_config and has_weights: logger.info("๐ŸŽฏ Detected full model artifacts") return "full" elif has_config: logger.warning("โš ๏ธ Found config.json but no weight files") elif has_weights: logger.warning("โš ๏ธ Found weight files but no config.json") logger.error("โŒ Could not detect model artifacts (neither full model nor LoRA)") return "unknown" def validate_model_path(self) -> bool: """Validate that the model path contains required files for Voxtral full or LoRA.""" self.artifact_type = self._detect_artifact_type() if self.artifact_type == "unknown": logger.error("โŒ Could not detect model type. Expected files:") logger.error(" For LoRA: adapter_config.json + adapter_model.safetensors (or .bin)") logger.error(" For Full Model: config.json + model.safetensors (or pytorch_model.bin)") logger.error(" For Voxtral ASR: also look for processor_config.json, tokenizer.json, etc.") return False if self.artifact_type == "lora": # Check for required LoRA files config_file = self.model_path / "adapter_config.json" weights_file_safetensors = self.model_path / "adapter_model.safetensors" weights_file_bin = self.model_path / "adapter_model.bin" if not config_file.exists(): logger.error("โŒ LoRA adapter missing required file: adapter_config.json") return False if not (weights_file_safetensors.exists() or weights_file_bin.exists()): logger.error("โŒ LoRA adapter missing weight files: adapter_model.safetensors or adapter_model.bin") return False logger.info("โœ… LoRA adapter validation successful") logger.info(f" - Config: {config_file.name}") if weights_file_safetensors.exists(): logger.info(f" - Weights: {weights_file_safetensors.name}") elif weights_file_bin.exists(): logger.info(f" - Weights: {weights_file_bin.name}") return True if self.artifact_type == "full": # Check for required full model files config_file = self.model_path / "config.json" safetensors_file = self.model_path / "model.safetensors" safetensors_index = self.model_path / "model.safetensors.index.json" pytorch_file = self.model_path / "pytorch_model.bin" if not config_file.exists(): logger.error("โŒ Full model missing required file: config.json") return False if not (safetensors_file.exists() or safetensors_index.exists() or pytorch_file.exists()): logger.error("โŒ Full model missing weight files: model.safetensors, model.safetensors.index.json, or pytorch_model.bin") return False logger.info("โœ… Full model validation successful") logger.info(f" - Config: {config_file.name}") if safetensors_file.exists(): logger.info(f" - Weights: {safetensors_file.name}") elif safetensors_index.exists(): logger.info(f" - Weights: {safetensors_index.name} (sharded)") elif pytorch_file.exists(): logger.info(f" - Weights: {pytorch_file.name}") return True return False def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str: """Create a comprehensive model card using the generate_model_card.py script""" try: # Import the model card generator import sys sys.path.append(os.path.join(os.path.dirname(__file__))) from generate_model_card import ModelCardGenerator, create_default_variables # Create generator generator = ModelCardGenerator() # Create variables for the model card variables = create_default_variables() # Determine whether dataset_name looks like a valid Hub dataset id (owner/dataset) hub_dataset = (self.dataset_name or "").strip() has_hub_dataset_id = bool(hub_dataset and "/" in hub_dataset and " " not in hub_dataset and len(hub_dataset.split("/")) == 2) # Update with actual values variables.update({ "repo_name": self.repo_id, "model_name": self.repo_id.split('/')[-1], "experiment_name": self.experiment_name or "model_push", "dataset_repo": self.dataset_repo or "", "author_name": self.author_name or "Model Author", "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.", "training_config_type": self.training_config_type or "Custom Configuration", "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B", "dataset_name": hub_dataset if hub_dataset else "", "has_hub_dataset_id": has_hub_dataset_id, # Only include model-index when a dataset is provided or when metrics are meaningful "include_model_index": bool(hub_dataset), "trainer_type": self.trainer_type or "SFTTrainer", "batch_size": str(self.batch_size) if self.batch_size else "8", "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"), "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6", "max_epochs": str(self.max_epochs) if self.max_epochs else "3", "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048", "hardware_info": self._get_hardware_info(), "trackio_url": self.trackio_url or "N/A", "training_loss": str(results.get('train_loss', 'N/A')), "validation_loss": str(results.get('eval_loss', 'N/A')), "perplexity": str(results.get('perplexity', 'N/A')), "quantized_models": False # Set to True if quantized models are available }) # Generate the model card model_card_content = generator.generate_model_card(variables) logger.info("โœ… Model card generated using generate_model_card.py") return model_card_content except Exception as e: logger.error(f"โŒ Failed to generate model card with generator: {e}") logger.info("๐Ÿ”„ Falling back to simple model card") return self._create_simple_model_card(training_config, results) def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str: """Create a simple model card tailored for Voxtral ASR (supports full and LoRA).""" tags = ["voxtral", "asr", "speech-to-text", "fine-tuning"] if self.artifact_type == "lora": tags.append("lora") front_matter = { "license": "apache-2.0", "tags": tags, "pipeline_tag": "automatic-speech-recognition", } fm_yaml = "---\n" + "\n".join([ "license: apache-2.0", "tags:", ]) + "\n" + "\n".join([f"- {t}" for t in tags]) + "\n" + "pipeline_tag: automatic-speech-recognition\n---\n\n" model_title = self.repo_id.split('/')[-1] body = [ f"# {model_title}", "", ("This repository contains a LoRA adapter for Voxtral ASR. " "Merge the adapter with the base model or load via PEFT for inference." if self.artifact_type == "lora" else "This repository contains a fine-tuned Voxtral ASR model."), "", "## Usage", "", ("```python\nfrom transformers import AutoProcessor\nfrom peft import PeftModel\nfrom transformers import AutoModelForSeq2SeqLM\n\nbase_model_id = 'mistralai/Voxtral-Mini-3B-2507'\nprocessor = AutoProcessor.from_pretrained(base_model_id)\nbase_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)\nmodel = PeftModel.from_pretrained(base_model, '{self.repo_id}')\n```" if self.artifact_type == "lora" else f"""```python from transformers import AutoProcessor, AutoModelForSeq2SeqLM processor = AutoProcessor.from_pretrained("{self.repo_id}") model = AutoModelForSeq2SeqLM.from_pretrained("{self.repo_id}") ```"""), "", "## Training Configuration", "", f"```json\n{json.dumps(training_config or {}, indent=2)}\n```", "", "## Training Results", "", f"```json\n{json.dumps(results or {}, indent=2)}\n```", "", f"**Hardware**: {self._get_hardware_info()}", ] return fm_yaml + "\n".join(body) def _get_model_size(self) -> float: """Get model size in GB""" try: total_size = 0 for file in self.model_path.rglob("*"): if file.is_file(): total_size += file.stat().st_size return total_size / (1024**3) # Convert to GB except: return 0.0 def _get_hardware_info(self) -> str: """Get hardware information""" try: import torch if torch.cuda.is_available(): gpu_name = torch.cuda.get_device_name(0) return f"GPU: {gpu_name}" else: return "CPU" except: return "Unknown" def upload_model_files(self) -> bool: """Upload model files to Hugging Face Hub with timeout protection""" try: logger.info("Uploading model files...") # Upload all files in the model directory for file_path in self.model_path.rglob("*"): if file_path.is_file(): relative_path = file_path.relative_to(self.model_path) remote_path = str(relative_path) logger.info(f"Uploading {relative_path}") try: upload_file( path_or_fileobj=str(file_path), path_in_repo=remote_path, repo_id=self.repo_id, token=self.token ) logger.info(f"โœ… Uploaded {relative_path}") except Exception as e: logger.error(f"โŒ Failed to upload {relative_path}: {e}") return False logger.info("โœ… Model files uploaded successfully") return True except Exception as e: logger.error(f"โŒ Failed to upload model files: {e}") return False def upload_training_results(self, results_path: str) -> bool: """Upload training results and logs""" try: logger.info("Uploading training results...") results_files = [ "train_results.json", "eval_results.json", "training_config.json", "training.log" ] for file_name in results_files: file_path = Path(results_path) / file_name if file_path.exists(): logger.info(f"Uploading {file_name}") upload_file( path_or_fileobj=str(file_path), path_in_repo=f"training_results/{file_name}", repo_id=self.repo_id, token=self.token ) logger.info("โœ… Training results uploaded successfully") return True except Exception as e: logger.error(f"โŒ Failed to upload training results: {e}") return False def create_readme(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> bool: """Create and upload README.md""" try: logger.info("Creating README.md...") readme_content = f"""# {self.repo_id.split('/')[-1]} A fine-tuned SmolLM3 model for text generation tasks. ## Quick Start ```python from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained("{self.repo_id}") tokenizer = AutoTokenizer.from_pretrained("{self.repo_id}") # Generate text text = "Hello, how are you?" inputs = tokenizer(text, return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ``` ## Model Information - **Base Model**: HuggingFaceTB/SmolLM3-3B - **Fine-tuning Date**: {datetime.now().strftime('%Y-%m-%d')} - **Model Size**: {self._get_model_size():.1f} GB - **Training Steps**: {results.get('total_steps', 'Unknown')} - **Final Loss**: {results.get('final_loss', 'Unknown')} - **Dataset Repository**: {self.dataset_repo} ## Training Configuration ```json {json.dumps(training_config, indent=2)} ``` ## Performance Metrics ```json {json.dumps(results, indent=2)} ``` ## Experiment Tracking Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}` ## Files - `model.safetensors.index.json`: Model weights (safetensors format) - `config.json`: Model configuration - `tokenizer.json`: Tokenizer configuration - `training_results/`: Training logs and results ## License MIT License """ # Write README to temporary file readme_path = Path("temp_readme.md") with open(readme_path, "w") as f: f.write(readme_content) # Upload README upload_file( path_or_fileobj=str(readme_path), path_in_repo="README.md", token=self.token, repo_id=self.repo_id ) # Clean up readme_path.unlink() logger.info("โœ… README.md uploaded successfully") return True except Exception as e: logger.error(f"โŒ Failed to create README: {e}") return False def push_model(self, training_config: Optional[Dict[str, Any]] = None, results: Optional[Dict[str, Any]] = None) -> bool: """Complete model push process""" logger.info(f"๐Ÿš€ Starting model push to {self.repo_id}") logger.info(f"๐Ÿ“‚ Model path: {self.model_path}") logger.info(f"๐ŸŽฏ Repository: {self.repo_id}") # Validate model path if not self.validate_model_path(): logger.error("โŒ Model validation failed. Please check:") logger.error(" 1. The model path exists and contains the expected files") logger.error(" 2. For LoRA models: adapter_config.json and adapter_model.* files") logger.error(" 3. For full models: config.json and model weight files") logger.error(" 4. Make sure the training completed successfully and saved the model") return False # Create repository if not self.create_repository(): return False # Load training config and results if not provided if training_config is None: training_config = self._load_training_config() if results is None: results = self._load_training_results() # Create model card and persist it inside the model directory as README.md model_card = self.create_model_card(training_config, results) local_readme_path = self.model_path / "README.md" try: with open(local_readme_path, "w", encoding="utf-8") as f: f.write(model_card) except Exception as e: logger.warning(f"โš ๏ธ Could not write README.md to model directory: {e}") # Upload README.md from the model directory upload_file( path_or_fileobj=str(local_readme_path) if local_readme_path.exists() else model_card, path_in_repo="README.md", repo_id=self.repo_id, token=self.token ) # Upload model files if not self.upload_model_files(): return False # Upload training results if results: self.upload_training_results(str(self.model_path)) # Log success logger.info(f"โœ… Model successfully pushed to {self.repo_id}") logger.info(f"๐ŸŽ‰ Model successfully pushed to: https://huggingface.co/{self.repo_id}") return True def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool: """Push dataset to Hugging Face Hub including audio files""" logger.info(f"๐Ÿš€ Starting dataset push to {dataset_repo_name}") try: from huggingface_hub import create_repo, upload_file import json # Determine full dataset repo name if "/" not in dataset_repo_name: dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}" # Create dataset repository try: create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True) logger.info(f"โœ… Created dataset repository: {dataset_repo_name}") except Exception as e: if "already exists" not in str(e).lower(): logger.error(f"โŒ Failed to create dataset repo: {e}") return False logger.info(f"๐Ÿ“ Dataset repository already exists: {dataset_repo_name}") # Read the dataset file dataset_file = Path(dataset_path) if not dataset_file.exists(): logger.error(f"โŒ Dataset file not found: {dataset_path}") return False # Read and process the JSONL to collect audio files and update paths audio_files = [] updated_rows = [] total_audio_size = 0 with open(dataset_file, 'r', encoding='utf-8') as f: for line_num, line in enumerate(f): try: row = json.loads(line.strip()) audio_path = row.get("audio_path", "") if audio_path: audio_file = Path(audio_path) if audio_file.exists(): # Store the original file for upload audio_files.append(audio_file) total_audio_size += audio_file.stat().st_size # Update path to be relative for the dataset row["audio_path"] = f"audio/{audio_file.name}" else: logger.warning(f"Audio file not found: {audio_path}") row["audio_path"] = "" # Clear missing files updated_rows.append(row) except json.JSONDecodeError as e: logger.warning(f"Invalid JSON on line {line_num + 1}: {e}") continue # Create updated JSONL with relative paths temp_jsonl_path = dataset_file.parent / "temp_data.jsonl" with open(temp_jsonl_path, "w", encoding="utf-8") as f: for row in updated_rows: f.write(json.dumps(row, ensure_ascii=False) + "\n") # Upload the updated JSONL file upload_file( path_or_fileobj=str(temp_jsonl_path), path_in_repo="data.jsonl", repo_id=dataset_repo_name, repo_type="dataset", token=self.token ) logger.info(f"โœ… Uploaded dataset file: {dataset_file.name}") # Clean up temp file temp_jsonl_path.unlink() # Upload audio files uploaded_count = 0 for audio_file in audio_files: try: remote_path = f"audio/{audio_file.name}" upload_file( path_or_fileobj=str(audio_file), path_in_repo=remote_path, repo_id=dataset_repo_name, repo_type="dataset", token=self.token ) uploaded_count += 1 logger.info(f"โœ… Uploaded audio file: {audio_file.name}") except Exception as e: logger.error(f"โŒ Failed to upload {audio_file.name}: {e}") # Calculate total dataset size total_dataset_size = dataset_file.stat().st_size + total_audio_size # Create a comprehensive dataset README readme_content = f"""--- dataset_info: features: - name: audio_path dtype: string - name: text dtype: string splits: - name: train num_bytes: {dataset_file.stat().st_size} num_examples: {len(updated_rows)} download_size: {total_dataset_size} dataset_size: {total_dataset_size} tags: - voxtral - asr - speech-to-text - fine-tuning - audio-dataset - tonic --- # Voxtral ASR Dataset This dataset was created for fine-tuning Voxtral ASR models. ## Dataset Structure - **audio_path**: Relative path to the audio file (stored in `audio/` directory) - **text**: Transcription of the audio ## Dataset Statistics - **Number of examples**: {len(updated_rows)} - **Audio files uploaded**: {uploaded_count} - **Total dataset size**: {total_dataset_size:,} bytes ## Usage ```python from datasets import load_dataset, Audio # Load dataset dataset = load_dataset("{dataset_repo_name}") # Load audio data dataset = dataset.cast_column("audio_path", Audio()) # Access first example print(dataset[0]["text"]) print(dataset[0]["audio_path"]) ``` ## Loading with Audio Decoding ```python from datasets import load_dataset, Audio # Load with automatic audio decoding dataset = load_dataset("{dataset_repo_name}") dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000)) # The audio column will contain the decoded audio arrays audio_array = dataset[0]["audio_path"]["array"] sampling_rate = dataset[0]["audio_path"]["sampling_rate"] ``` ## Dataset Features This dataset contains audio files with corresponding transcriptions for Voxtral ASR model fine-tuning. All audio files are stored in the `audio/` directory and referenced using relative paths in the dataset. ## License This dataset is created for research and educational purposes. """ # Upload README readme_path = dataset_file.parent / "README.md" with open(readme_path, "w", encoding="utf-8") as f: f.write(readme_content) upload_file( path_or_fileobj=str(readme_path), path_in_repo="README.md", repo_id=dataset_repo_name, repo_type="dataset", token=self.token ) readme_path.unlink() # Clean up temp file logger.info(f"โœ… Dataset README uploaded") logger.info(f"๐ŸŽ‰ Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}") logger.info(f"๐Ÿ“Š Uploaded {len(updated_rows)} examples and {uploaded_count} audio files") return True except Exception as e: logger.error(f"โŒ Failed to push dataset: {e}") return False def test_dataset_push(self, dataset_path: str) -> bool: """Test dataset validation without uploading to Hugging Face Hub""" logger.info(f"๐Ÿงช Testing dataset validation for {dataset_path}") try: # Read the dataset file dataset_file = Path(dataset_path) if not dataset_file.exists(): logger.error(f"โŒ Dataset file not found: {dataset_path}") return False # Read and process the JSONL to validate audio files audio_files = [] updated_rows = [] total_audio_size = 0 missing_files = [] invalid_json_lines = [] with open(dataset_file, 'r', encoding='utf-8') as f: for line_num, line in enumerate(f): try: row = json.loads(line.strip()) audio_path = row.get("audio_path", "") if audio_path: audio_file = Path(audio_path) if audio_file.exists(): # Store the file info for validation audio_files.append(audio_file) total_audio_size += audio_file.stat().st_size else: missing_files.append(str(audio_path)) updated_rows.append(row) except json.JSONDecodeError as e: invalid_json_lines.append(f"Line {line_num + 1}: {e}") continue # Report validation results logger.info("๐Ÿ“Š Dataset Validation Results:") logger.info(f" - Total examples: {len(updated_rows)}") logger.info(f" - Valid audio files: {len(audio_files)}") logger.info(f" - Total audio size: {total_audio_size:,} bytes") logger.info(f" - Missing audio files: {len(missing_files)}") logger.info(f" - Invalid JSON lines: {len(invalid_json_lines)}") if missing_files: logger.warning("โš ๏ธ Missing audio files:") for missing in missing_files[:5]: # Show first 5 logger.warning(f" - {missing}") if len(missing_files) > 5: logger.warning(f" ... and {len(missing_files) - 5} more") if invalid_json_lines: logger.warning("โš ๏ธ Invalid JSON lines:") for invalid in invalid_json_lines[:3]: # Show first 3 logger.warning(f" - {invalid}") if len(invalid_json_lines) > 3: logger.warning(f" ... and {len(invalid_json_lines) - 3} more") # Show sample of how paths will be converted if audio_files: logger.info("๐Ÿ”„ Path conversion preview:") for audio_file in audio_files[:3]: # Show first 3 logger.info(f" - {str(audio_file)} โ†’ audio/{audio_file.name}") # Overall validation status if len(updated_rows) == 0: logger.error("โŒ No valid examples found in dataset") return False if len(missing_files) > 0: logger.warning("โš ๏ธ Some audio files are missing - they will be skipped during upload") else: logger.info("โœ… All audio files found and valid") logger.info("โœ… Dataset validation completed successfully!") return True except Exception as e: logger.error(f"โŒ Failed to validate dataset: {e}") return False def _load_training_config(self) -> Dict[str, Any]: """Load training configuration""" config_path = self.model_path / "training_config.json" if config_path.exists(): with open(config_path, "r") as f: return json.load(f) return {"model_name": "HuggingFaceTB/SmolLM3-3B"} def _load_training_results(self) -> Dict[str, Any]: """Load training results""" results_path = self.model_path / "train_results.json" if results_path.exists(): with open(results_path, "r") as f: return json.load(f) return {"final_loss": "Unknown", "total_steps": "Unknown"} def parse_args(): """Parse command line arguments""" parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub') # Subcommands subparsers = parser.add_subparsers(dest='command', help='Available commands') # Model push subcommand model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub') model_parser.add_argument('model_path', type=str, help='Path to trained model directory') model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.') model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token') model_parser.add_argument('--private', action='store_true', help='Make repository private') model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card') model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card') model_parser.add_argument('--model-name', type=str, default=None, help='Base model name') model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name') # Optional model card metadata model_parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for model card') model_parser.add_argument('--dataset-repo', type=str, default=None, help='Dataset repo for model card') model_parser.add_argument('--training-config-type', type=str, default=None, help='Training config type for model card') model_parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type for model card') model_parser.add_argument('--batch-size', type=str, default=None, help='Batch size for model card') model_parser.add_argument('--gradient-accumulation-steps', type=str, default=None, help='Grad accum steps for model card') model_parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate for model card') model_parser.add_argument('--max-epochs', type=str, default=None, help='Max epochs for model card') model_parser.add_argument('--max-seq-length', type=str, default=None, help='Max seq length for model card') model_parser.add_argument('--trackio-url', type=str, default=None, help='Trackio URL for model card') # Dataset push subcommand dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub') dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file') dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name') dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token') dataset_parser.add_argument('--private', action='store_true', help='Make repository private') dataset_parser.add_argument('--test', action='store_true', help='Test mode - validate dataset without uploading') return parser.parse_args() def main(): """Main function""" args = parse_args() # Setup logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) if not args.command: logger.error("โŒ No command specified. Use 'model' or 'dataset' subcommand.") return 1 try: if args.command == 'model': logger.info("Starting model push to Hugging Face Hub") # Initialize pusher pusher = HuggingFacePusher( model_path=args.model_path, repo_name=args.repo_name, token=args.token, private=args.private, author_name=args.author_name, model_description=args.model_description, model_name=args.model_name, dataset_name=args.dataset_name, experiment_name=args.experiment_name, dataset_repo=args.dataset_repo, training_config_type=args.training_config_type, trainer_type=args.trainer_type, batch_size=args.batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, max_epochs=args.max_epochs, max_seq_length=args.max_seq_length, trackio_url=args.trackio_url, ) # Push model success = pusher.push_model() if success: logger.info("โœ… Model push completed successfully!") logger.info(f"๐ŸŒ View your model at: https://huggingface.co/{args.repo_name}") else: logger.error("โŒ Model push failed!") return 1 elif args.command == 'dataset': logger.info("Starting dataset push to Hugging Face Hub") # Initialize pusher for dataset pusher = HuggingFacePusher( model_path="", # Not needed for dataset push repo_name=args.repo_name, token=args.token, private=args.private ) if getattr(args, 'test', False): # Test mode - validate dataset without uploading success = pusher.test_dataset_push(args.dataset_path) if success: logger.info("โœ… Dataset validation completed successfully!") else: logger.error("โŒ Dataset validation failed!") return 1 else: # Push dataset success = pusher.push_dataset(args.dataset_path, args.repo_name) if success: logger.info("โœ… Dataset push completed successfully!") logger.info(f"๐Ÿ“Š View your dataset at: https://huggingface.co/datasets/{args.repo_name}") else: logger.error("โŒ Dataset push failed!") return 1 except Exception as e: logger.error(f"โŒ Error during push: {e}") return 1 return 0 if __name__ == "__main__": exit(main())