#!/usr/bin/env python3
"""
Push Trained Models and Datasets to Hugging Face Hub

Usage:
    # Push a trained model
    python push_to_huggingface.py model /path/to/model my-model-repo

    # Push a dataset
    python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo

Authentication:
    Set HF_TOKEN environment variable or use --token:
        export HF_TOKEN=your_token_here
"""

import os
import json
import argparse
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from datetime import datetime

# Set timeout for HF operations to prevent hanging
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600'

try:
    from huggingface_hub import HfApi, create_repo, upload_file
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")

logger = logging.getLogger(__name__)

class HuggingFacePusher:
    """Push trained models to Hugging Face Hub"""

    def __init__(
        self,
        model_path: str,
        repo_name: str,
        token: Optional[str] = None,
        private: bool = False,
        author_name: Optional[str] = None,
        model_description: Optional[str] = None,
        model_name: Optional[str] = None,
        dataset_name: Optional[str] = None,
        # Optional metadata for model card generation
        experiment_name: Optional[str] = None,
        dataset_repo: Optional[str] = None,
        training_config_type: Optional[str] = None,
        trainer_type: Optional[str] = None,
        batch_size: Optional[str] = None,
        gradient_accumulation_steps: Optional[str] = None,
        learning_rate: Optional[str] = None,
        max_epochs: Optional[str] = None,
        max_seq_length: Optional[str] = None,
        trackio_url: Optional[str] = None,
    ):
        self.model_path = Path(model_path)
        # Original user input (may be just the repo name without username)
        self.repo_name = repo_name
        self.token = token or os.getenv('HF_TOKEN')
        self.private = private
        self.author_name = author_name
        self.model_description = model_description
        # Model card generation details
        self.model_name = model_name
        self.dataset_name = dataset_name
        # Optional metadata (ensure attributes always exist to avoid AttributeError)
        self.experiment_name = experiment_name
        self.dataset_repo = dataset_repo
        self.training_config_type = training_config_type
        self.trainer_type = trainer_type
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.max_seq_length = max_seq_length
        self.trackio_url = trackio_url

        # Initialize HF API
        if HF_AVAILABLE:
            self.api = HfApi(token=self.token)
        else:
            raise ImportError("huggingface_hub is required. Install with: pip install huggingface_hub")

        # Resolve the full repo id (username/repo) if user only provided repo name
        self.repo_id = self._resolve_repo_id(self.repo_name)

        # Artifact type detection (full vs lora)
        self.artifact_type: Optional[str] = None

        logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")
    def _resolve_repo_id(self, repo_name: str) -> str:
        """Return a fully-qualified repo id in the form username/repo.

        If the provided name already contains a '/', it is returned unchanged.
        Otherwise, we attempt to derive the username from the authenticated token
        or from the HF_USERNAME environment variable.
        """
        try:
            if "/" in repo_name:
                return repo_name
            # Need a username. Prefer API whoami(), fall back to env HF_USERNAME
            username: Optional[str] = None
            if self.token:
                try:
                    user_info = self.api.whoami()
                    username = user_info.get("name") or user_info.get("username")
                except Exception:
                    username = None
            if not username:
                username = os.getenv("HF_USERNAME")
            if not username:
                raise ValueError(
                    "Username could not be determined. Provide a token or set HF_USERNAME, "
                    "or pass a fully-qualified repo id 'username/repo'."
                )
            return f"{username}/{repo_name}"
        except Exception as resolve_error:
            logger.error(f"Failed to resolve full repo id for '{repo_name}': {resolve_error}")
            # Fall back to provided value (may fail later at create/upload)
            return repo_name
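
    # Illustrative sketch of the resolution order above (comments only, not executed).
    # The username "alice" is a hypothetical example, not a value used by this script:
    #   _resolve_repo_id("alice/my-model")                  -> "alice/my-model" (already qualified)
    #   _resolve_repo_id("my-model") with a valid token     -> "alice/my-model" (via api.whoami())
    #   _resolve_repo_id("my-model") with HF_USERNAME=alice -> "alice/my-model" (env fallback)
    #   _resolve_repo_id("my-model") with neither           -> logs an error, returns "my-model"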

    def create_repository(self) -> bool:
        """Create the Hugging Face repository"""
        try:
            logger.info(f"Creating repository: {self.repo_id}")
            # Create repository with timeout handling
            try:
                create_repo(
                    repo_id=self.repo_id,
                    token=self.token,
                    private=self.private,
                    exist_ok=True
                )
                logger.info(f"✅ Repository created: https://huggingface.co/{self.repo_id}")
                return True
            except Exception as e:
                logger.error(f"❌ Repository creation failed: {e}")
                return False
        except Exception as e:
            logger.error(f"❌ Failed to create repository: {e}")
            return False

    def _detect_artifact_type(self) -> str:
        """Detect whether the output dir contains a full model or a LoRA adapter."""
        logger.info(f"Detecting model artifacts in: {self.model_path}")

        # Check if path exists
        if not self.model_path.exists():
            logger.error(f"❌ Model path does not exist: {self.model_path}")
            return "unknown"

        # List all files for debugging
        all_files = list(self.model_path.rglob("*"))
        logger.info(f"📁 Found {len(all_files)} files in model directory")
        if len(all_files) <= 20:  # Only show if not too many files
            for f in all_files:
                logger.info(f"  - {f.relative_to(self.model_path)}")

        # LoRA artifacts - be more flexible about file combinations
        lora_config = self.model_path / "adapter_config.json"
        lora_weights_safetensors = self.model_path / "adapter_model.safetensors"
        lora_weights_bin = self.model_path / "adapter_model.bin"
        has_lora_config = lora_config.exists()
        has_lora_weights = lora_weights_safetensors.exists() or lora_weights_bin.exists()

        if has_lora_config:
            logger.info("✅ Found adapter_config.json")
        if has_lora_weights:
            logger.info("✅ Found LoRA weight files")

        if has_lora_config and has_lora_weights:
            logger.info("🎯 Detected LoRA adapter artifacts")
            return "lora"
        elif has_lora_config:
            logger.warning("⚠️ Found adapter_config.json but no weight files")
        elif has_lora_weights:
            logger.warning("⚠️ Found LoRA weight files but no adapter_config.json")

        # Full model artifacts - also be more flexible
        config_file = self.model_path / "config.json"
        safetensors_model = self.model_path / "model.safetensors"
        safetensors_index = self.model_path / "model.safetensors.index.json"
        pytorch_model = self.model_path / "pytorch_model.bin"
        has_config = config_file.exists()
        has_weights = (safetensors_model.exists() or safetensors_index.exists() or pytorch_model.exists())

        if has_config:
            logger.info("✅ Found config.json")
        if has_weights:
            logger.info("✅ Found model weight files")

        if has_config and has_weights:
            logger.info("🎯 Detected full model artifacts")
            return "full"
        elif has_config:
            logger.warning("⚠️ Found config.json but no weight files")
        elif has_weights:
            logger.warning("⚠️ Found weight files but no config.json")

        logger.error("❌ Could not detect model artifacts (neither full model nor LoRA)")
        return "unknown"

    def validate_model_path(self) -> bool:
        """Validate that the model path contains required files for Voxtral full or LoRA."""
        self.artifact_type = self._detect_artifact_type()

        if self.artifact_type == "unknown":
            logger.error("❌ Could not detect model type. Expected files:")
            logger.error("  For LoRA: adapter_config.json + adapter_model.safetensors (or .bin)")
            logger.error("  For Full Model: config.json + model.safetensors (or pytorch_model.bin)")
            logger.error("  For Voxtral ASR: also look for processor_config.json, tokenizer.json, etc.")
            return False

        if self.artifact_type == "lora":
            # Check for required LoRA files
            config_file = self.model_path / "adapter_config.json"
            weights_file_safetensors = self.model_path / "adapter_model.safetensors"
            weights_file_bin = self.model_path / "adapter_model.bin"

            if not config_file.exists():
                logger.error("❌ LoRA adapter missing required file: adapter_config.json")
                return False
            if not (weights_file_safetensors.exists() or weights_file_bin.exists()):
                logger.error("❌ LoRA adapter missing weight files: adapter_model.safetensors or adapter_model.bin")
                return False

            logger.info("✅ LoRA adapter validation successful")
            logger.info(f"  - Config: {config_file.name}")
            if weights_file_safetensors.exists():
                logger.info(f"  - Weights: {weights_file_safetensors.name}")
            elif weights_file_bin.exists():
                logger.info(f"  - Weights: {weights_file_bin.name}")
            return True

        if self.artifact_type == "full":
            # Check for required full model files
            config_file = self.model_path / "config.json"
            safetensors_file = self.model_path / "model.safetensors"
            safetensors_index = self.model_path / "model.safetensors.index.json"
            pytorch_file = self.model_path / "pytorch_model.bin"

            if not config_file.exists():
                logger.error("❌ Full model missing required file: config.json")
                return False
            if not (safetensors_file.exists() or safetensors_index.exists() or pytorch_file.exists()):
                logger.error("❌ Full model missing weight files: model.safetensors, model.safetensors.index.json, or pytorch_model.bin")
                return False

            logger.info("✅ Full model validation successful")
            logger.info(f"  - Config: {config_file.name}")
            if safetensors_file.exists():
                logger.info(f"  - Weights: {safetensors_file.name}")
            elif safetensors_index.exists():
                logger.info(f"  - Weights: {safetensors_index.name} (sharded)")
            elif pytorch_file.exists():
                logger.info(f"  - Weights: {pytorch_file.name}")
            return True

        return False

    def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
        """Create a comprehensive model card using the generate_model_card.py script"""
        try:
            # Import the model card generator from the same directory as this script
            import sys
            sys.path.append(os.path.dirname(__file__))
            from generate_model_card import ModelCardGenerator, create_default_variables

            # Create generator
            generator = ModelCardGenerator()

            # Create variables for the model card
            variables = create_default_variables()

            # Determine whether dataset_name looks like a valid Hub dataset id (owner/dataset)
            hub_dataset = (self.dataset_name or "").strip()
            has_hub_dataset_id = bool(hub_dataset and "/" in hub_dataset and " " not in hub_dataset and len(hub_dataset.split("/")) == 2)

            # Update with actual values
            variables.update({
                "repo_name": self.repo_id,
                "model_name": self.repo_id.split('/')[-1],
                "experiment_name": self.experiment_name or "model_push",
                "dataset_repo": self.dataset_repo or "",
                "author_name": self.author_name or "Model Author",
                "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
                "training_config_type": self.training_config_type or "Custom Configuration",
                "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
                "dataset_name": hub_dataset if hub_dataset else "",
                "has_hub_dataset_id": has_hub_dataset_id,
                # Only include model-index when a dataset is provided or when metrics are meaningful
                "include_model_index": bool(hub_dataset),
                "trainer_type": self.trainer_type or "SFTTrainer",
                "batch_size": str(self.batch_size) if self.batch_size else "8",
                "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"),
                "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
                "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
                "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
                "hardware_info": self._get_hardware_info(),
                "trackio_url": self.trackio_url or "N/A",
                "training_loss": str(results.get('train_loss', 'N/A')),
                "validation_loss": str(results.get('eval_loss', 'N/A')),
                "perplexity": str(results.get('perplexity', 'N/A')),
                "quantized_models": False  # Set to True if quantized models are available
            })

            # Generate the model card
            model_card_content = generator.generate_model_card(variables)
            logger.info("✅ Model card generated using generate_model_card.py")
            return model_card_content
        except Exception as e:
            logger.error(f"❌ Failed to generate model card with generator: {e}")
            logger.info("📝 Falling back to simple model card")
            return self._create_simple_model_card(training_config, results)

    def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
        """Create a simple model card tailored for Voxtral ASR (supports full and LoRA)."""
        tags = ["voxtral", "asr", "speech-to-text", "fine-tuning"]
        if self.artifact_type == "lora":
            tags.append("lora")

        # YAML front matter for the Hub model card
        fm_yaml = (
            "---\n"
            "license: apache-2.0\n"
            "tags:\n"
            + "\n".join(f"- {t}" for t in tags)
            + "\npipeline_tag: automatic-speech-recognition\n---\n\n"
        )

        model_title = self.repo_id.split('/')[-1]
        body = [
            f"# {model_title}",
            "",
            ("This repository contains a LoRA adapter for Voxtral ASR. "
             "Merge the adapter with the base model or load via PEFT for inference." if self.artifact_type == "lora" else
             "This repository contains a fine-tuned Voxtral ASR model."),
            "",
            "## Usage",
            "",
            (f"```python\nfrom transformers import AutoProcessor\nfrom peft import PeftModel\nfrom transformers import AutoModelForSeq2SeqLM\n\nbase_model_id = 'mistralai/Voxtral-Mini-3B-2507'\nprocessor = AutoProcessor.from_pretrained(base_model_id)\nbase_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)\nmodel = PeftModel.from_pretrained(base_model, '{self.repo_id}')\n```" if self.artifact_type == "lora" else
             f"""```python
from transformers import AutoProcessor, AutoModelForSeq2SeqLM
processor = AutoProcessor.from_pretrained("{self.repo_id}")
model = AutoModelForSeq2SeqLM.from_pretrained("{self.repo_id}")
```"""),
            "",
            "## Training Configuration",
            "",
            f"```json\n{json.dumps(training_config or {}, indent=2)}\n```",
            "",
            "## Training Results",
            "",
            f"```json\n{json.dumps(results or {}, indent=2)}\n```",
            "",
            f"**Hardware**: {self._get_hardware_info()}",
        ]
        return fm_yaml + "\n".join(body)
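
    # Example front matter emitted by the fallback card above for a LoRA adapter
    # (sketch; the tag list depends on self.artifact_type):
    #   ---
    #   license: apache-2.0
    #   tags:
    #   - voxtral
    #   - asr
    #   - speech-to-text
    #   - fine-tuning
    #   - lora
    #   pipeline_tag: automatic-speech-recognition
    #   ---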

    def _get_model_size(self) -> float:
        """Get model size in GB"""
        try:
            total_size = 0
            for file in self.model_path.rglob("*"):
                if file.is_file():
                    total_size += file.stat().st_size
            return total_size / (1024**3)  # Convert to GB
        except Exception:
            return 0.0

    def _get_hardware_info(self) -> str:
        """Get hardware information"""
        try:
            import torch
            if torch.cuda.is_available():
                gpu_name = torch.cuda.get_device_name(0)
                return f"GPU: {gpu_name}"
            else:
                return "CPU"
        except Exception:
            return "Unknown"

    def upload_model_files(self) -> bool:
        """Upload model files to Hugging Face Hub with timeout protection"""
        try:
            logger.info("Uploading model files...")

            # Upload all files in the model directory
            for file_path in self.model_path.rglob("*"):
                if file_path.is_file():
                    relative_path = file_path.relative_to(self.model_path)
                    remote_path = str(relative_path)
                    logger.info(f"Uploading {relative_path}")
                    try:
                        upload_file(
                            path_or_fileobj=str(file_path),
                            path_in_repo=remote_path,
                            repo_id=self.repo_id,
                            token=self.token
                        )
                        logger.info(f"✅ Uploaded {relative_path}")
                    except Exception as e:
                        logger.error(f"❌ Failed to upload {relative_path}: {e}")
                        return False

            logger.info("✅ Model files uploaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to upload model files: {e}")
            return False

    def upload_training_results(self, results_path: str) -> bool:
        """Upload training results and logs"""
        try:
            logger.info("Uploading training results...")
            results_files = [
                "train_results.json",
                "eval_results.json",
                "training_config.json",
                "training.log"
            ]
            for file_name in results_files:
                file_path = Path(results_path) / file_name
                if file_path.exists():
                    logger.info(f"Uploading {file_name}")
                    upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=f"training_results/{file_name}",
                        repo_id=self.repo_id,
                        token=self.token
                    )
            logger.info("✅ Training results uploaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to upload training results: {e}")
            return False
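
    # Note: files uploaded by upload_training_results() land under training_results/
    # in the model repo, e.g. training_results/train_results.json.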

    def create_readme(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> bool:
        """Create and upload README.md"""
        try:
            logger.info("Creating README.md...")
            readme_content = f"""# {self.repo_id.split('/')[-1]}

A fine-tuned SmolLM3 model for text generation tasks.

## Quick Start

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{self.repo_id}")
tokenizer = AutoTokenizer.from_pretrained("{self.repo_id}")

# Generate text
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Model Information

- **Base Model**: HuggingFaceTB/SmolLM3-3B
- **Fine-tuning Date**: {datetime.now().strftime('%Y-%m-%d')}
- **Model Size**: {self._get_model_size():.1f} GB
- **Training Steps**: {results.get('total_steps', 'Unknown')}
- **Final Loss**: {results.get('final_loss', 'Unknown')}
- **Dataset Repository**: {self.dataset_repo}

## Training Configuration

```json
{json.dumps(training_config, indent=2)}
```

## Performance Metrics

```json
{json.dumps(results, indent=2)}
```

## Experiment Tracking

Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}`

## Files

- `model.safetensors` (or sharded `model.safetensors.index.json`): Model weights (safetensors format)
- `config.json`: Model configuration
- `tokenizer.json`: Tokenizer configuration
- `training_results/`: Training logs and results

## License

MIT License
"""
            # Write README to a temporary file
            readme_path = Path("temp_readme.md")
            with open(readme_path, "w") as f:
                f.write(readme_content)

            # Upload README
            upload_file(
                path_or_fileobj=str(readme_path),
                path_in_repo="README.md",
                repo_id=self.repo_id,
                token=self.token
            )

            # Clean up
            readme_path.unlink()
            logger.info("✅ README.md uploaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to create README: {e}")
            return False

    def push_model(self, training_config: Optional[Dict[str, Any]] = None,
                   results: Optional[Dict[str, Any]] = None) -> bool:
        """Complete model push process"""
        logger.info(f"🚀 Starting model push to {self.repo_id}")
        logger.info(f"📁 Model path: {self.model_path}")
        logger.info(f"🎯 Repository: {self.repo_id}")

        # Validate model path
        if not self.validate_model_path():
            logger.error("❌ Model validation failed. Please check:")
            logger.error("  1. The model path exists and contains the expected files")
            logger.error("  2. For LoRA models: adapter_config.json and adapter_model.* files")
            logger.error("  3. For full models: config.json and model weight files")
            logger.error("  4. Make sure the training completed successfully and saved the model")
            return False

        # Create repository
        if not self.create_repository():
            return False

        # Load training config and results if not provided
        if training_config is None:
            training_config = self._load_training_config()
        if results is None:
            results = self._load_training_results()

        # Create model card and persist it inside the model directory as README.md
        model_card = self.create_model_card(training_config, results)
        local_readme_path = self.model_path / "README.md"
        try:
            with open(local_readme_path, "w", encoding="utf-8") as f:
                f.write(model_card)
        except Exception as e:
            logger.warning(f"⚠️ Could not write README.md to model directory: {e}")

        # Upload README.md from the model directory if it was written, otherwise from memory
        upload_file(
            path_or_fileobj=str(local_readme_path) if local_readme_path.exists() else model_card.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=self.repo_id,
            token=self.token
        )

        # Upload model files
        if not self.upload_model_files():
            return False

        # Upload training results
        if results:
            self.upload_training_results(str(self.model_path))

        # Log success
        logger.info(f"✅ Model successfully pushed to {self.repo_id}")
        logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_id}")
        return True
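
    # Minimal programmatic usage sketch (assumes HF_TOKEN is set; the model path and
    # repo name below are hypothetical placeholders):
    #   pusher = HuggingFacePusher(
    #       model_path="./outputs/voxtral-finetuned",
    #       repo_name="my-voxtral-asr",
    #   )
    #   pusher.push_model()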

    def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
        """Push dataset to Hugging Face Hub including audio files"""
        logger.info(f"🚀 Starting dataset push to {dataset_repo_name}")
        try:
            from huggingface_hub import create_repo, upload_file
            import json

            # Determine full dataset repo name
            if "/" not in dataset_repo_name:
                dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}"

            # Create dataset repository
            try:
                create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True)
                logger.info(f"✅ Created dataset repository: {dataset_repo_name}")
            except Exception as e:
                if "already exists" not in str(e).lower():
                    logger.error(f"❌ Failed to create dataset repo: {e}")
                    return False
                logger.info(f"📁 Dataset repository already exists: {dataset_repo_name}")

            # Read the dataset file
            dataset_file = Path(dataset_path)
            if not dataset_file.exists():
                logger.error(f"❌ Dataset file not found: {dataset_path}")
                return False

            # Read and process the JSONL to collect audio files and update paths
            audio_files = []
            updated_rows = []
            total_audio_size = 0
            with open(dataset_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f):
                    try:
                        row = json.loads(line.strip())
                        audio_path = row.get("audio_path", "")
                        if audio_path:
                            audio_file = Path(audio_path)
                            if audio_file.exists():
                                # Store the original file for upload
                                audio_files.append(audio_file)
                                total_audio_size += audio_file.stat().st_size
                                # Update path to be relative for the dataset
                                row["audio_path"] = f"audio/{audio_file.name}"
                            else:
                                logger.warning(f"Audio file not found: {audio_path}")
                                row["audio_path"] = ""  # Clear missing files
                        updated_rows.append(row)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Invalid JSON on line {line_num + 1}: {e}")
                        continue

            # Create updated JSONL with relative paths
            temp_jsonl_path = dataset_file.parent / "temp_data.jsonl"
            with open(temp_jsonl_path, "w", encoding="utf-8") as f:
                for row in updated_rows:
                    f.write(json.dumps(row, ensure_ascii=False) + "\n")

            # Upload the updated JSONL file
            upload_file(
                path_or_fileobj=str(temp_jsonl_path),
                path_in_repo="data.jsonl",
                repo_id=dataset_repo_name,
                repo_type="dataset",
                token=self.token
            )
            logger.info(f"✅ Uploaded dataset file: {dataset_file.name}")

            # Clean up temp file
            temp_jsonl_path.unlink()

            # Upload audio files
            uploaded_count = 0
            for audio_file in audio_files:
                try:
                    remote_path = f"audio/{audio_file.name}"
                    upload_file(
                        path_or_fileobj=str(audio_file),
                        path_in_repo=remote_path,
                        repo_id=dataset_repo_name,
                        repo_type="dataset",
                        token=self.token
                    )
                    uploaded_count += 1
                    logger.info(f"✅ Uploaded audio file: {audio_file.name}")
                except Exception as e:
                    logger.error(f"❌ Failed to upload {audio_file.name}: {e}")

            # Calculate total dataset size
            total_dataset_size = dataset_file.stat().st_size + total_audio_size

            # Create a comprehensive dataset README
            readme_content = f"""---
dataset_info:
  features:
  - name: audio_path
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: {dataset_file.stat().st_size}
    num_examples: {len(updated_rows)}
  download_size: {total_dataset_size}
  dataset_size: {total_dataset_size}
tags:
- voxtral
- asr
- speech-to-text
- fine-tuning
- audio-dataset
- tonic
---

# Voxtral ASR Dataset

This dataset was created for fine-tuning Voxtral ASR models.

## Dataset Structure

- **audio_path**: Relative path to the audio file (stored in the `audio/` directory)
- **text**: Transcription of the audio

## Dataset Statistics

- **Number of examples**: {len(updated_rows)}
- **Audio files uploaded**: {uploaded_count}
- **Total dataset size**: {total_dataset_size:,} bytes

## Usage

```python
from datasets import load_dataset, Audio

# Load dataset
dataset = load_dataset("{dataset_repo_name}")

# Load audio data
dataset = dataset.cast_column("audio_path", Audio())

# Access the first training example
print(dataset["train"][0]["text"])
print(dataset["train"][0]["audio_path"])
```

## Loading with Audio Decoding

```python
from datasets import load_dataset, Audio

# Load with automatic audio decoding
dataset = load_dataset("{dataset_repo_name}")
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))

# The audio column will contain the decoded audio arrays
audio_array = dataset["train"][0]["audio_path"]["array"]
sampling_rate = dataset["train"][0]["audio_path"]["sampling_rate"]
```

## Dataset Features

This dataset contains audio files with corresponding transcriptions for Voxtral ASR model fine-tuning.
All audio files are stored in the `audio/` directory and referenced using relative paths in the dataset.

## License

This dataset is created for research and educational purposes.
"""
            # Upload README
            readme_path = dataset_file.parent / "README.md"
            with open(readme_path, "w", encoding="utf-8") as f:
                f.write(readme_content)
            upload_file(
                path_or_fileobj=str(readme_path),
                path_in_repo="README.md",
                repo_id=dataset_repo_name,
                repo_type="dataset",
                token=self.token
            )
            readme_path.unlink()  # Clean up temp file
            logger.info("✅ Dataset README uploaded")
            logger.info(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
            logger.info(f"📊 Uploaded {len(updated_rows)} examples and {uploaded_count} audio files")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to push dataset: {e}")
            return False
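
    # Resulting layout in the dataset repo (sketch):
    #   data.jsonl        # rows with "audio_path" rewritten to audio/<filename>
    #   audio/<filename>  # one file per row whose audio file existed locally
    #   README.md         # generated dataset card with dataset_info front matter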

    def test_dataset_push(self, dataset_path: str) -> bool:
        """Test dataset validation without uploading to Hugging Face Hub"""
        logger.info(f"🧪 Testing dataset validation for {dataset_path}")
        try:
            # Read the dataset file
            dataset_file = Path(dataset_path)
            if not dataset_file.exists():
                logger.error(f"❌ Dataset file not found: {dataset_path}")
                return False

            # Read and process the JSONL to validate audio files
            audio_files = []
            updated_rows = []
            total_audio_size = 0
            missing_files = []
            invalid_json_lines = []
            with open(dataset_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f):
                    try:
                        row = json.loads(line.strip())
                        audio_path = row.get("audio_path", "")
                        if audio_path:
                            audio_file = Path(audio_path)
                            if audio_file.exists():
                                # Store the file info for validation
                                audio_files.append(audio_file)
                                total_audio_size += audio_file.stat().st_size
                            else:
                                missing_files.append(str(audio_path))
                        updated_rows.append(row)
                    except json.JSONDecodeError as e:
                        invalid_json_lines.append(f"Line {line_num + 1}: {e}")
                        continue

            # Report validation results
            logger.info("📊 Dataset Validation Results:")
            logger.info(f"  - Total examples: {len(updated_rows)}")
            logger.info(f"  - Valid audio files: {len(audio_files)}")
            logger.info(f"  - Total audio size: {total_audio_size:,} bytes")
            logger.info(f"  - Missing audio files: {len(missing_files)}")
            logger.info(f"  - Invalid JSON lines: {len(invalid_json_lines)}")

            if missing_files:
                logger.warning("⚠️ Missing audio files:")
                for missing in missing_files[:5]:  # Show first 5
                    logger.warning(f"  - {missing}")
                if len(missing_files) > 5:
                    logger.warning(f"  ... and {len(missing_files) - 5} more")

            if invalid_json_lines:
                logger.warning("⚠️ Invalid JSON lines:")
                for invalid in invalid_json_lines[:3]:  # Show first 3
                    logger.warning(f"  - {invalid}")
                if len(invalid_json_lines) > 3:
                    logger.warning(f"  ... and {len(invalid_json_lines) - 3} more")

            # Show a sample of how paths will be converted
            if audio_files:
                logger.info("🔄 Path conversion preview:")
                for audio_file in audio_files[:3]:  # Show first 3
                    logger.info(f"  - {str(audio_file)} → audio/{audio_file.name}")

            # Overall validation status
            if len(updated_rows) == 0:
                logger.error("❌ No valid examples found in dataset")
                return False

            if len(missing_files) > 0:
                logger.warning("⚠️ Some audio files are missing - they will be skipped during upload")
            else:
                logger.info("✅ All audio files found and valid")

            logger.info("✅ Dataset validation completed successfully!")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to validate dataset: {e}")
            return False

    def _load_training_config(self) -> Dict[str, Any]:
        """Load training configuration"""
        config_path = self.model_path / "training_config.json"
        if config_path.exists():
            with open(config_path, "r") as f:
                return json.load(f)
        return {"model_name": "HuggingFaceTB/SmolLM3-3B"}

    def _load_training_results(self) -> Dict[str, Any]:
        """Load training results"""
        results_path = self.model_path / "train_results.json"
        if results_path.exists():
            with open(results_path, "r") as f:
                return json.load(f)
        return {"final_loss": "Unknown", "total_steps": "Unknown"}

def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Push trained models and datasets to Hugging Face Hub')

    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Model push subcommand
    model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub')
    model_parser.add_argument('model_path', type=str, help='Path to trained model directory')
    model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
    model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
    model_parser.add_argument('--private', action='store_true', help='Make repository private')
    model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
    model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
    model_parser.add_argument('--model-name', type=str, default=None, help='Base model name')
    model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
    # Optional model card metadata
    model_parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for model card')
    model_parser.add_argument('--dataset-repo', type=str, default=None, help='Dataset repo for model card')
    model_parser.add_argument('--training-config-type', type=str, default=None, help='Training config type for model card')
    model_parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type for model card')
    model_parser.add_argument('--batch-size', type=str, default=None, help='Batch size for model card')
    model_parser.add_argument('--gradient-accumulation-steps', type=str, default=None, help='Gradient accumulation steps for model card')
    model_parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate for model card')
    model_parser.add_argument('--max-epochs', type=str, default=None, help='Max epochs for model card')
    model_parser.add_argument('--max-seq-length', type=str, default=None, help='Max seq length for model card')
    model_parser.add_argument('--trackio-url', type=str, default=None, help='Trackio URL for model card')

    # Dataset push subcommand
    dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub')
    dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file')
    dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
    dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
    dataset_parser.add_argument('--private', action='store_true', help='Make repository private')
    dataset_parser.add_argument('--test', action='store_true', help='Test mode - validate dataset without uploading')

    return parser.parse_args()
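
# Example invocations (sketch; the repo names and paths are placeholders):
#   python push_to_huggingface.py model ./outputs/voxtral-finetuned my-voxtral-asr \
#       --trainer-type SFTTrainer --batch-size 8
#   python push_to_huggingface.py dataset ./data.jsonl my-voxtral-dataset --test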

def main():
    """Main function"""
    args = parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    if not args.command:
        logger.error("❌ No command specified. Use 'model' or 'dataset' subcommand.")
        return 1

    try:
        if args.command == 'model':
            logger.info("Starting model push to Hugging Face Hub")

            # Initialize pusher
            pusher = HuggingFacePusher(
                model_path=args.model_path,
                repo_name=args.repo_name,
                token=args.token,
                private=args.private,
                author_name=args.author_name,
                model_description=args.model_description,
                model_name=args.model_name,
                dataset_name=args.dataset_name,
                experiment_name=args.experiment_name,
                dataset_repo=args.dataset_repo,
                training_config_type=args.training_config_type,
                trainer_type=args.trainer_type,
                batch_size=args.batch_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps,
                learning_rate=args.learning_rate,
                max_epochs=args.max_epochs,
                max_seq_length=args.max_seq_length,
                trackio_url=args.trackio_url,
            )

            # Push model
            success = pusher.push_model()
            if success:
                logger.info("✅ Model push completed successfully!")
                logger.info(f"🎉 View your model at: https://huggingface.co/{pusher.repo_id}")
            else:
                logger.error("❌ Model push failed!")
                return 1
        elif args.command == 'dataset':
            logger.info("Starting dataset push to Hugging Face Hub")

            # Initialize pusher for dataset push
            pusher = HuggingFacePusher(
                model_path="",  # Not needed for dataset push
                repo_name=args.repo_name,
                token=args.token,
                private=args.private
            )

            if getattr(args, 'test', False):
                # Test mode - validate dataset without uploading
                success = pusher.test_dataset_push(args.dataset_path)
                if success:
                    logger.info("✅ Dataset validation completed successfully!")
                else:
                    logger.error("❌ Dataset validation failed!")
                    return 1
            else:
                # Push dataset
                success = pusher.push_dataset(args.dataset_path, args.repo_name)
                if success:
                    logger.info("✅ Dataset push completed successfully!")
                    logger.info(f"🎉 View your dataset at: https://huggingface.co/datasets/{pusher.repo_id}")
                else:
                    logger.error("❌ Dataset push failed!")
                    return 1
    except Exception as e:
        logger.error(f"❌ Error during push: {e}")
        return 1

    return 0

if __name__ == "__main__":
    exit(main())