#!/usr/bin/env python3
"""
Push Trained Models and Datasets to Hugging Face Hub

Usage:
    # Push a trained model
    python push_to_huggingface.py model /path/to/model my-model-repo

    # Push a dataset
    python push_to_huggingface.py dataset /path/to/dataset.jsonl my-dataset-repo

Authentication:
Set HF_TOKEN environment variable or use --token:
    export HF_TOKEN=your_token_here
"""

import os
import json
import argparse
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from datetime import datetime

# Set timeout for HF operations to prevent hanging
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'
os.environ['HF_HUB_UPLOAD_TIMEOUT'] = '600'

try:
    from huggingface_hub import HfApi, create_repo, upload_file
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")

logger = logging.getLogger(__name__)

class HuggingFacePusher:
    """Push trained models to Hugging Face Hub"""
    
    def __init__(
        self,
        model_path: str,
        repo_name: str,
        token: Optional[str] = None,
        private: bool = False,
        author_name: Optional[str] = None,
        model_description: Optional[str] = None,
        model_name: Optional[str] = None,
        dataset_name: Optional[str] = None,
        # Optional metadata for model card generation
        experiment_name: Optional[str] = None,
        dataset_repo: Optional[str] = None,
        training_config_type: Optional[str] = None,
        trainer_type: Optional[str] = None,
        batch_size: Optional[str] = None,
        gradient_accumulation_steps: Optional[str] = None,
        learning_rate: Optional[str] = None,
        max_epochs: Optional[str] = None,
        max_seq_length: Optional[str] = None,
        trackio_url: Optional[str] = None,
    ):
        self.model_path = Path(model_path)
        # Original user input (may be just the repo name without username)
        self.repo_name = repo_name
        self.token = token or os.getenv('HF_TOKEN')
        self.private = private
        self.author_name = author_name
        self.model_description = model_description

        # Model card generation details
        self.model_name = model_name
        self.dataset_name = dataset_name
        # Optional metadata (ensure attributes always exist to avoid AttributeError)
        self.experiment_name = experiment_name
        self.dataset_repo = dataset_repo
        self.training_config_type = training_config_type
        self.trainer_type = trainer_type
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.max_seq_length = max_seq_length
        self.trackio_url = trackio_url
        
        # Initialize HF API
        if HF_AVAILABLE:
            self.api = HfApi(token=self.token)
        else:
            raise ImportError("huggingface_hub is required. Install with: pip install huggingface_hub")
        
        # Resolve the full repo id (username/repo) if user only provided repo name
        self.repo_id = self._resolve_repo_id(self.repo_name)
        # Artifact type detection (full vs lora)
        self.artifact_type: Optional[str] = None

        logger.info(f"Initialized HuggingFacePusher for {self.repo_id}")

    def _resolve_repo_id(self, repo_name: str) -> str:
        """Return a fully-qualified repo id in the form username/repo.

        If the provided name already contains a '/', it is returned unchanged.
        Otherwise, we attempt to derive the username from the authenticated token
        or from the HF_USERNAME environment variable.
        """
        try:
            if "/" in repo_name:
                return repo_name

            # Need a username. Prefer API whoami(), fallback to env HF_USERNAME
            username: Optional[str] = None
            if self.token:
                try:
                    user_info = self.api.whoami()
                    username = user_info.get("name") or user_info.get("username")
                except Exception:
                    username = None

            if not username:
                username = os.getenv("HF_USERNAME")

            if not username:
                raise ValueError(
                    "Username could not be determined. Provide a token or set HF_USERNAME, "
                    "or pass a fully-qualified repo id 'username/repo'."
                )

            return f"{username}/{repo_name}"
        except Exception as resolve_error:
            logger.error(f"Failed to resolve full repo id for '{repo_name}': {resolve_error}")
            # Fall back to provided value (may fail later at create/upload)
            return repo_name
    
    def create_repository(self) -> bool:
        """Create the Hugging Face repository"""
        try:
            logger.info(f"Creating repository: {self.repo_id}")
            
            # Create repository with timeout handling
            try:
                # Create repository
                create_repo(
                    repo_id=self.repo_id,
                    token=self.token,
                    private=self.private,
                    exist_ok=True
                )
                
                logger.info(f"✅ Repository created: https://huggingface.co/{self.repo_id}")
                return True
                
            except Exception as e:
                logger.error(f"❌ Repository creation failed: {e}")
                return False
            
        except Exception as e:
            logger.error(f"❌ Failed to create repository: {e}")
            return False
    
    def _detect_artifact_type(self) -> str:
        """Detect whether output dir contains a full model or a LoRA adapter."""
        logger.info(f"Detecting model artifacts in: {self.model_path}")

        # Check if path exists
        if not self.model_path.exists():
            logger.error(f"❌ Model path does not exist: {self.model_path}")
            return "unknown"

        # List all files for debugging
        all_files = list(self.model_path.rglob("*"))
        logger.info(f"📁 Found {len(all_files)} files in model directory")
        if len(all_files) <= 20:  # Only show if not too many files
            for f in all_files:
                logger.info(f"   - {f.relative_to(self.model_path)}")

        # LoRA artifacts - be more flexible about file combinations
        lora_config = self.model_path / "adapter_config.json"
        lora_weights_safetensors = self.model_path / "adapter_model.safetensors"
        lora_weights_bin = self.model_path / "adapter_model.bin"

        has_lora_config = lora_config.exists()
        has_lora_weights = lora_weights_safetensors.exists() or lora_weights_bin.exists()

        if has_lora_config:
            logger.info("✅ Found adapter_config.json")
        if has_lora_weights:
            logger.info("✅ Found LoRA weight files")

        if has_lora_config and has_lora_weights:
            logger.info("🎯 Detected LoRA adapter artifacts")
            return "lora"
        elif has_lora_config:
            logger.warning("⚠️ Found adapter_config.json but no weight files")
        elif has_lora_weights:
            logger.warning("⚠️ Found LoRA weight files but no adapter_config.json")

        # Full model artifacts - also be more flexible
        config_file = self.model_path / "config.json"
        safetensors_model = self.model_path / "model.safetensors"
        safetensors_index = self.model_path / "model.safetensors.index.json"
        pytorch_model = self.model_path / "pytorch_model.bin"

        has_config = config_file.exists()
        has_weights = (safetensors_model.exists() or safetensors_index.exists() or pytorch_model.exists())

        if has_config:
            logger.info("✅ Found config.json")
        if has_weights:
            logger.info("✅ Found model weight files")

        if has_config and has_weights:
            logger.info("🎯 Detected full model artifacts")
            return "full"
        elif has_config:
            logger.warning("⚠️ Found config.json but no weight files")
        elif has_weights:
            logger.warning("⚠️ Found weight files but no config.json")

        logger.error("❌ Could not detect model artifacts (neither full model nor LoRA)")
        return "unknown"

    def validate_model_path(self) -> bool:
        """Validate that the model path contains required files for Voxtral full or LoRA."""
        self.artifact_type = self._detect_artifact_type()

        if self.artifact_type == "unknown":
            logger.error("❌ Could not detect model type. Expected files:")
            logger.error("   For LoRA: adapter_config.json + adapter_model.safetensors (or .bin)")
            logger.error("   For Full Model: config.json + model.safetensors (or pytorch_model.bin)")
            logger.error("   For Voxtral ASR: also look for processor_config.json, tokenizer.json, etc.")
            return False

        if self.artifact_type == "lora":
            # Check for required LoRA files
            config_file = self.model_path / "adapter_config.json"
            weights_file_safetensors = self.model_path / "adapter_model.safetensors"
            weights_file_bin = self.model_path / "adapter_model.bin"

            if not config_file.exists():
                logger.error("❌ LoRA adapter missing required file: adapter_config.json")
                return False

            if not (weights_file_safetensors.exists() or weights_file_bin.exists()):
                logger.error("❌ LoRA adapter missing weight files: adapter_model.safetensors or adapter_model.bin")
                return False

            logger.info("✅ LoRA adapter validation successful")
            logger.info(f"   - Config: {config_file.name}")
            if weights_file_safetensors.exists():
                logger.info(f"   - Weights: {weights_file_safetensors.name}")
            elif weights_file_bin.exists():
                logger.info(f"   - Weights: {weights_file_bin.name}")

            return True

        if self.artifact_type == "full":
            # Check for required full model files
            config_file = self.model_path / "config.json"
            safetensors_file = self.model_path / "model.safetensors"
            safetensors_index = self.model_path / "model.safetensors.index.json"
            pytorch_file = self.model_path / "pytorch_model.bin"

            if not config_file.exists():
                logger.error("❌ Full model missing required file: config.json")
                return False

            if not (safetensors_file.exists() or safetensors_index.exists() or pytorch_file.exists()):
                logger.error("❌ Full model missing weight files: model.safetensors, model.safetensors.index.json, or pytorch_model.bin")
                return False

            logger.info("✅ Full model validation successful")
            logger.info(f"   - Config: {config_file.name}")
            if safetensors_file.exists():
                logger.info(f"   - Weights: {safetensors_file.name}")
            elif safetensors_index.exists():
                logger.info(f"   - Weights: {safetensors_index.name} (sharded)")
            elif pytorch_file.exists():
                logger.info(f"   - Weights: {pytorch_file.name}")

            return True

        return False
    
    def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
        """Create a comprehensive model card using the generate_model_card.py script"""
        try:
            # Import the model card generator
            import sys
            sys.path.append(os.path.join(os.path.dirname(__file__)))
            from generate_model_card import ModelCardGenerator, create_default_variables
            
            # Create generator
            generator = ModelCardGenerator()
            
            # Create variables for the model card
            variables = create_default_variables()

            # Determine whether dataset_name looks like a valid Hub dataset id (owner/dataset)
            hub_dataset = (self.dataset_name or "").strip()
            has_hub_dataset_id = bool(hub_dataset and "/" in hub_dataset and " " not in hub_dataset and len(hub_dataset.split("/")) == 2)

            # Update with actual values
            variables.update({
                "repo_name": self.repo_id,
                "model_name": self.repo_id.split('/')[-1],
                "experiment_name": self.experiment_name or "model_push",
                "dataset_repo": self.dataset_repo or "",
                "author_name": self.author_name or "Model Author",
                "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
                "training_config_type": self.training_config_type or "Custom Configuration",
                "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
                "dataset_name": hub_dataset if hub_dataset else "",
                "has_hub_dataset_id": has_hub_dataset_id,
                # Only include model-index when a dataset is provided or when metrics are meaningful
                "include_model_index": bool(hub_dataset),
                "trainer_type": self.trainer_type or "SFTTrainer",
                "batch_size": str(self.batch_size) if self.batch_size else "8",
                "gradient_accumulation_steps": str(self.gradient_accumulation_steps) if self.gradient_accumulation_steps else variables.get("gradient_accumulation_steps", "16"),
                "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
                "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
                "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
                "hardware_info": self._get_hardware_info(),
                "trackio_url": self.trackio_url or "N/A",
                "training_loss": str(results.get('train_loss', 'N/A')),
                "validation_loss": str(results.get('eval_loss', 'N/A')),
                "perplexity": str(results.get('perplexity', 'N/A')),
                "quantized_models": False  # Set to True if quantized models are available
            })
            
            # Generate the model card
            model_card_content = generator.generate_model_card(variables)
            
            logger.info("✅ Model card generated using generate_model_card.py")
            return model_card_content
            
        except Exception as e:
            logger.error(f"❌ Failed to generate model card with generator: {e}")
            logger.info("🔄 Falling back to simple model card")
            return self._create_simple_model_card(training_config, results)
    
    def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
        """Create a simple model card tailored for Voxtral ASR (supports full and LoRA)."""
        tags = ["voxtral", "asr", "speech-to-text", "fine-tuning"]
        if self.artifact_type == "lora":
            tags.append("lora")
        front_matter = {
            "license": "apache-2.0",
            "tags": tags,
            "pipeline_tag": "automatic-speech-recognition",
        }
        fm_yaml = "---\n" + "\n".join([
            "license: apache-2.0",
            "tags:",
        ]) + "\n" + "\n".join([f"- {t}" for t in tags]) + "\n" + "pipeline_tag: automatic-speech-recognition\n---\n\n"
        model_title = self.repo_id.split('/')[-1]
        body = [
            f"# {model_title}",
            "",
            ("This repository contains a LoRA adapter for Voxtral ASR. "
             "Merge the adapter with the base model or load via PEFT for inference." if self.artifact_type == "lora" else
             "This repository contains a fine-tuned Voxtral ASR model."),
            "",
            "## Usage",
            "",
            ("```python\nfrom transformers import AutoProcessor\nfrom peft import PeftModel\nfrom transformers import AutoModelForSeq2SeqLM\n\nbase_model_id = 'mistralai/Voxtral-Mini-3B-2507'\nprocessor = AutoProcessor.from_pretrained(base_model_id)\nbase_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)\nmodel = PeftModel.from_pretrained(base_model, '{self.repo_id}')\n```" if self.artifact_type == "lora" else
             f"""```python
from transformers import AutoProcessor, AutoModelForSeq2SeqLM

processor = AutoProcessor.from_pretrained("{self.repo_id}")
model = AutoModelForSeq2SeqLM.from_pretrained("{self.repo_id}")
```"""),
            "",
            "## Training Configuration",
            "",
            f"```json\n{json.dumps(training_config or {}, indent=2)}\n```",
            "",
            "## Training Results",
            "",
            f"```json\n{json.dumps(results or {}, indent=2)}\n```",
            "",
            f"**Hardware**: {self._get_hardware_info()}",
        ]
        return fm_yaml + "\n".join(body)
    
    def _get_model_size(self) -> float:
        """Get model size in GB"""
        try:
            total_size = 0
            for file in self.model_path.rglob("*"):
                if file.is_file():
                    total_size += file.stat().st_size
            return total_size / (1024**3)  # Convert to GB
        except:
            return 0.0
    
    def _get_hardware_info(self) -> str:
        """Get hardware information"""
        try:
            import torch
            if torch.cuda.is_available():
                gpu_name = torch.cuda.get_device_name(0)
                return f"GPU: {gpu_name}"
            else:
                return "CPU"
        except:
            return "Unknown"
    
    def upload_model_files(self) -> bool:
        """Upload model files to Hugging Face Hub with timeout protection"""
        try:
            logger.info("Uploading model files...")
            
            # Upload all files in the model directory
            for file_path in self.model_path.rglob("*"):
                if file_path.is_file():
                    relative_path = file_path.relative_to(self.model_path)
                    remote_path = str(relative_path)
                    
                    logger.info(f"Uploading {relative_path}")
                    
                    try:
                        upload_file(
                            path_or_fileobj=str(file_path),
                            path_in_repo=remote_path,
                            repo_id=self.repo_id,
                            token=self.token
                        )
                        logger.info(f"✅ Uploaded {relative_path}")
                        
                    except Exception as e:
                        logger.error(f"❌ Failed to upload {relative_path}: {e}")
                        return False
            
            logger.info("✅ Model files uploaded successfully")
            return True
            
        except Exception as e:
            logger.error(f"❌ Failed to upload model files: {e}")
            return False
    
    def upload_training_results(self, results_path: str) -> bool:
        """Upload training results and logs"""
        try:
            logger.info("Uploading training results...")
            
            results_files = [
                "train_results.json",
                "eval_results.json",
                "training_config.json",
                "training.log"
            ]
            
            for file_name in results_files:
                file_path = Path(results_path) / file_name
                if file_path.exists():
                    logger.info(f"Uploading {file_name}")
                    upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=f"training_results/{file_name}",
                        repo_id=self.repo_id,
                        token=self.token
                    )
            
            logger.info("✅ Training results uploaded successfully")
            return True
            
        except Exception as e:
            logger.error(f"❌ Failed to upload training results: {e}")
            return False
    
    def create_readme(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> bool:
        """Create and upload README.md"""
        try:
            logger.info("Creating README.md...")
            
            readme_content = f"""# {self.repo_id.split('/')[-1]}

A fine-tuned SmolLM3 model for text generation tasks.

## Quick Start

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{self.repo_id}")
tokenizer = AutoTokenizer.from_pretrained("{self.repo_id}")

# Generate text
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Model Information

- **Base Model**: HuggingFaceTB/SmolLM3-3B
- **Fine-tuning Date**: {datetime.now().strftime('%Y-%m-%d')}
- **Model Size**: {self._get_model_size():.1f} GB
- **Training Steps**: {results.get('total_steps', 'Unknown')}
- **Final Loss**: {results.get('final_loss', 'Unknown')}
- **Dataset Repository**: {self.dataset_repo}

## Training Configuration

```json
{json.dumps(training_config, indent=2)}
```

## Performance Metrics

```json
{json.dumps(results, indent=2)}
```

## Experiment Tracking

Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}`

## Files

- `model.safetensors.index.json`: Model weights (safetensors format)
- `config.json`: Model configuration
- `tokenizer.json`: Tokenizer configuration
- `training_results/`: Training logs and results

## License

MIT License
"""
            
            # Write README to temporary file
            readme_path = Path("temp_readme.md")
            with open(readme_path, "w") as f:
                f.write(readme_content)
            
            # Upload README
            upload_file(
                path_or_fileobj=str(readme_path),
                path_in_repo="README.md",
                token=self.token,
                repo_id=self.repo_id
            )
            
            # Clean up
            readme_path.unlink()
            
            logger.info("✅ README.md uploaded successfully")
            return True
            
        except Exception as e:
            logger.error(f"❌ Failed to create README: {e}")
            return False
    

    def push_model(self, training_config: Optional[Dict[str, Any]] = None,
                   results: Optional[Dict[str, Any]] = None) -> bool:
        """Complete model push process"""
        logger.info(f"🚀 Starting model push to {self.repo_id}")
        logger.info(f"📂 Model path: {self.model_path}")
        logger.info(f"🎯 Repository: {self.repo_id}")

        # Validate model path
        if not self.validate_model_path():
            logger.error("❌ Model validation failed. Please check:")
            logger.error("   1. The model path exists and contains the expected files")
            logger.error("   2. For LoRA models: adapter_config.json and adapter_model.* files")
            logger.error("   3. For full models: config.json and model weight files")
            logger.error("   4. Make sure the training completed successfully and saved the model")
            return False

        # Create repository
        if not self.create_repository():
            return False

        # Load training config and results if not provided
        if training_config is None:
            training_config = self._load_training_config()

        if results is None:
            results = self._load_training_results()

        # Create model card and persist it inside the model directory as README.md
        model_card = self.create_model_card(training_config, results)
        local_readme_path = self.model_path / "README.md"
        try:
            with open(local_readme_path, "w", encoding="utf-8") as f:
                f.write(model_card)
        except Exception as e:
            logger.warning(f"⚠️ Could not write README.md to model directory: {e}")

        # Upload README.md from the model directory
        upload_file(
            path_or_fileobj=str(local_readme_path) if local_readme_path.exists() else model_card,
            path_in_repo="README.md",
            repo_id=self.repo_id,
            token=self.token
        )

        # Upload model files
        if not self.upload_model_files():
            return False

        # Upload training results
        if results:
            self.upload_training_results(str(self.model_path))

        # Log success
        logger.info(f"✅ Model successfully pushed to {self.repo_id}")
        logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_id}")

        return True

    def push_dataset(self, dataset_path: str, dataset_repo_name: str) -> bool:
        """Push dataset to Hugging Face Hub including audio files"""
        logger.info(f"🚀 Starting dataset push to {dataset_repo_name}")

        try:
            from huggingface_hub import create_repo, upload_file
            import json

            # Determine full dataset repo name
            if "/" not in dataset_repo_name:
                dataset_repo_name = f"{self.repo_id.split('/')[0]}/{dataset_repo_name}"

            # Create dataset repository
            try:
                create_repo(dataset_repo_name, repo_type="dataset", token=self.token, exist_ok=True)
                logger.info(f"✅ Created dataset repository: {dataset_repo_name}")
            except Exception as e:
                if "already exists" not in str(e).lower():
                    logger.error(f"❌ Failed to create dataset repo: {e}")
                    return False
                logger.info(f"📁 Dataset repository already exists: {dataset_repo_name}")

            # Read the dataset file
            dataset_file = Path(dataset_path)
            if not dataset_file.exists():
                logger.error(f"❌ Dataset file not found: {dataset_path}")
                return False

            # Read and process the JSONL to collect audio files and update paths
            audio_files = []
            updated_rows = []
            total_audio_size = 0

            with open(dataset_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f):
                    try:
                        row = json.loads(line.strip())
                        audio_path = row.get("audio_path", "")

                        if audio_path:
                            audio_file = Path(audio_path)
                            if audio_file.exists():
                                # Store the original file for upload
                                audio_files.append(audio_file)
                                total_audio_size += audio_file.stat().st_size

                                # Update path to be relative for the dataset
                                row["audio_path"] = f"audio/{audio_file.name}"
                            else:
                                logger.warning(f"Audio file not found: {audio_path}")
                                row["audio_path"] = ""  # Clear missing files

                        updated_rows.append(row)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Invalid JSON on line {line_num + 1}: {e}")
                        continue

            # Create updated JSONL with relative paths
            temp_jsonl_path = dataset_file.parent / "temp_data.jsonl"
            with open(temp_jsonl_path, "w", encoding="utf-8") as f:
                for row in updated_rows:
                    f.write(json.dumps(row, ensure_ascii=False) + "\n")

            # Upload the updated JSONL file
            upload_file(
                path_or_fileobj=str(temp_jsonl_path),
                path_in_repo="data.jsonl",
                repo_id=dataset_repo_name,
                repo_type="dataset",
                token=self.token
            )
            logger.info(f"✅ Uploaded dataset file: {dataset_file.name}")

            # Clean up temp file
            temp_jsonl_path.unlink()

            # Upload audio files
            uploaded_count = 0
            for audio_file in audio_files:
                try:
                    remote_path = f"audio/{audio_file.name}"
                    upload_file(
                        path_or_fileobj=str(audio_file),
                        path_in_repo=remote_path,
                        repo_id=dataset_repo_name,
                        repo_type="dataset",
                        token=self.token
                    )
                    uploaded_count += 1
                    logger.info(f"✅ Uploaded audio file: {audio_file.name}")
                except Exception as e:
                    logger.error(f"❌ Failed to upload {audio_file.name}: {e}")

            # Calculate total dataset size
            total_dataset_size = dataset_file.stat().st_size + total_audio_size

            # Create a comprehensive dataset README
            readme_content = f"""---
dataset_info:
  features:
    - name: audio_path
      dtype: string
    - name: text
      dtype: string
  splits:
    - name: train
      num_bytes: {dataset_file.stat().st_size}
      num_examples: {len(updated_rows)}
  download_size: {total_dataset_size}
  dataset_size: {total_dataset_size}
tags:
- voxtral
- asr
- speech-to-text
- fine-tuning
- audio-dataset
- tonic
---

# Voxtral ASR Dataset

This dataset was created for fine-tuning Voxtral ASR models.

## Dataset Structure

- **audio_path**: Relative path to the audio file (stored in `audio/` directory)
- **text**: Transcription of the audio

## Dataset Statistics

- **Number of examples**: {len(updated_rows)}
- **Audio files uploaded**: {uploaded_count}
- **Total dataset size**: {total_dataset_size:,} bytes

## Usage

```python
from datasets import load_dataset, Audio

# Load dataset
dataset = load_dataset("{dataset_repo_name}")

# Load audio data
dataset = dataset.cast_column("audio_path", Audio())

# Access first example
print(dataset[0]["text"])
print(dataset[0]["audio_path"])
```

## Loading with Audio Decoding

```python
from datasets import load_dataset, Audio

# Load with automatic audio decoding
dataset = load_dataset("{dataset_repo_name}")
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))

# The audio column will contain the decoded audio arrays
audio_array = dataset[0]["audio_path"]["array"]
sampling_rate = dataset[0]["audio_path"]["sampling_rate"]
```

## Dataset Features

This dataset contains audio files with corresponding transcriptions for Voxtral ASR model fine-tuning.
All audio files are stored in the `audio/` directory and referenced using relative paths in the dataset.

## License

This dataset is created for research and educational purposes.
"""

            # Upload README
            readme_path = dataset_file.parent / "README.md"
            with open(readme_path, "w", encoding="utf-8") as f:
                f.write(readme_content)

            upload_file(
                path_or_fileobj=str(readme_path),
                path_in_repo="README.md",
                repo_id=dataset_repo_name,
                repo_type="dataset",
                token=self.token
            )

            readme_path.unlink()  # Clean up temp file

            logger.info(f"✅ Dataset README uploaded")
            logger.info(f"🎉 Dataset successfully pushed to: https://huggingface.co/datasets/{dataset_repo_name}")
            logger.info(f"📊 Uploaded {len(updated_rows)} examples and {uploaded_count} audio files")

            return True

        except Exception as e:
            logger.error(f"❌ Failed to push dataset: {e}")
            return False

    def test_dataset_push(self, dataset_path: str) -> bool:
        """Test dataset validation without uploading to Hugging Face Hub"""
        logger.info(f"🧪 Testing dataset validation for {dataset_path}")

        try:
            # Read the dataset file
            dataset_file = Path(dataset_path)
            if not dataset_file.exists():
                logger.error(f"❌ Dataset file not found: {dataset_path}")
                return False

            # Read and process the JSONL to validate audio files
            audio_files = []
            updated_rows = []
            total_audio_size = 0
            missing_files = []
            invalid_json_lines = []

            with open(dataset_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f):
                    try:
                        row = json.loads(line.strip())
                        audio_path = row.get("audio_path", "")

                        if audio_path:
                            audio_file = Path(audio_path)
                            if audio_file.exists():
                                # Store the file info for validation
                                audio_files.append(audio_file)
                                total_audio_size += audio_file.stat().st_size
                            else:
                                missing_files.append(str(audio_path))

                        updated_rows.append(row)
                    except json.JSONDecodeError as e:
                        invalid_json_lines.append(f"Line {line_num + 1}: {e}")
                        continue

            # Report validation results
            logger.info("📊 Dataset Validation Results:")
            logger.info(f"   - Total examples: {len(updated_rows)}")
            logger.info(f"   - Valid audio files: {len(audio_files)}")
            logger.info(f"   - Total audio size: {total_audio_size:,} bytes")
            logger.info(f"   - Missing audio files: {len(missing_files)}")
            logger.info(f"   - Invalid JSON lines: {len(invalid_json_lines)}")

            if missing_files:
                logger.warning("⚠️ Missing audio files:")
                for missing in missing_files[:5]:  # Show first 5
                    logger.warning(f"   - {missing}")
                if len(missing_files) > 5:
                    logger.warning(f"   ... and {len(missing_files) - 5} more")

            if invalid_json_lines:
                logger.warning("⚠️ Invalid JSON lines:")
                for invalid in invalid_json_lines[:3]:  # Show first 3
                    logger.warning(f"   - {invalid}")
                if len(invalid_json_lines) > 3:
                    logger.warning(f"   ... and {len(invalid_json_lines) - 3} more")

            # Show sample of how paths will be converted
            if audio_files:
                logger.info("🔄 Path conversion preview:")
                for audio_file in audio_files[:3]:  # Show first 3
                    logger.info(f"   - {str(audio_file)} → audio/{audio_file.name}")

            # Overall validation status
            if len(updated_rows) == 0:
                logger.error("❌ No valid examples found in dataset")
                return False

            if len(missing_files) > 0:
                logger.warning("⚠️ Some audio files are missing - they will be skipped during upload")
            else:
                logger.info("✅ All audio files found and valid")

            logger.info("✅ Dataset validation completed successfully!")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to validate dataset: {e}")
            return False

    def _load_training_config(self) -> Dict[str, Any]:
        """Load training configuration"""
        config_path = self.model_path / "training_config.json"
        if config_path.exists():
            with open(config_path, "r") as f:
                return json.load(f)
        return {"model_name": "HuggingFaceTB/SmolLM3-3B"}
    
    def _load_training_results(self) -> Dict[str, Any]:
        """Load training results"""
        results_path = self.model_path / "train_results.json"
        if results_path.exists():
            with open(results_path, "r") as f:
                return json.load(f)
        return {"final_loss": "Unknown", "total_steps": "Unknown"}

def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Push trained model to Hugging Face Hub')
    
    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Model push subcommand
    model_parser = subparsers.add_parser('model', help='Push trained model to Hugging Face Hub')
    model_parser.add_argument('model_path', type=str, help='Path to trained model directory')
    model_parser.add_argument('repo_name', type=str, help='Hugging Face repository name (repo-name). Username will be auto-detected from your token.')
    model_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
    model_parser.add_argument('--private', action='store_true', help='Make repository private')
    model_parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
    model_parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
    model_parser.add_argument('--model-name', type=str, default=None, help='Base model name')
    model_parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
    # Optional model card metadata
    model_parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for model card')
    model_parser.add_argument('--dataset-repo', type=str, default=None, help='Dataset repo for model card')
    model_parser.add_argument('--training-config-type', type=str, default=None, help='Training config type for model card')
    model_parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type for model card')
    model_parser.add_argument('--batch-size', type=str, default=None, help='Batch size for model card')
    model_parser.add_argument('--gradient-accumulation-steps', type=str, default=None, help='Grad accum steps for model card')
    model_parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate for model card')
    model_parser.add_argument('--max-epochs', type=str, default=None, help='Max epochs for model card')
    model_parser.add_argument('--max-seq-length', type=str, default=None, help='Max seq length for model card')
    model_parser.add_argument('--trackio-url', type=str, default=None, help='Trackio URL for model card')

    # Dataset push subcommand
    dataset_parser = subparsers.add_parser('dataset', help='Push dataset to Hugging Face Hub')
    dataset_parser.add_argument('dataset_path', type=str, help='Path to dataset JSONL file')
    dataset_parser.add_argument('repo_name', type=str, help='Hugging Face dataset repository name')
    dataset_parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
    dataset_parser.add_argument('--private', action='store_true', help='Make repository private')
    dataset_parser.add_argument('--test', action='store_true', help='Test mode - validate dataset without uploading')
    
    return parser.parse_args()

def main():
    """Main function"""
    args = parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    if not args.command:
        logger.error("❌ No command specified. Use 'model' or 'dataset' subcommand.")
        return 1

    try:
        if args.command == 'model':
            logger.info("Starting model push to Hugging Face Hub")

            # Initialize pusher
            pusher = HuggingFacePusher(
                model_path=args.model_path,
                repo_name=args.repo_name,
                token=args.token,
                private=args.private,
                author_name=args.author_name,
                model_description=args.model_description,
                model_name=args.model_name,
                dataset_name=args.dataset_name,
                experiment_name=args.experiment_name,
                dataset_repo=args.dataset_repo,
                training_config_type=args.training_config_type,
                trainer_type=args.trainer_type,
                batch_size=args.batch_size,
                gradient_accumulation_steps=args.gradient_accumulation_steps,
                learning_rate=args.learning_rate,
                max_epochs=args.max_epochs,
                max_seq_length=args.max_seq_length,
                trackio_url=args.trackio_url,
            )

            # Push model
            success = pusher.push_model()

            if success:
                logger.info("✅ Model push completed successfully!")
                logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
            else:
                logger.error("❌ Model push failed!")
                return 1

        elif args.command == 'dataset':
            logger.info("Starting dataset push to Hugging Face Hub")

            # Initialize pusher for dataset
            pusher = HuggingFacePusher(
                model_path="",  # Not needed for dataset push
                repo_name=args.repo_name,
                token=args.token,
                private=args.private
            )

            if getattr(args, 'test', False):
                # Test mode - validate dataset without uploading
                success = pusher.test_dataset_push(args.dataset_path)
                if success:
                    logger.info("✅ Dataset validation completed successfully!")
                else:
                    logger.error("❌ Dataset validation failed!")
                    return 1
            else:
                # Push dataset
                success = pusher.push_dataset(args.dataset_path, args.repo_name)

                if success:
                    logger.info("✅ Dataset push completed successfully!")
                    logger.info(f"📊 View your dataset at: https://huggingface.co/datasets/{args.repo_name}")
                else:
                    logger.error("❌ Dataset push failed!")
                    return 1

    except Exception as e:
        logger.error(f"❌ Error during push: {e}")
        return 1

    return 0

if __name__ == "__main__":
    exit(main())