Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Model Recovery and Deployment Script | |
Recovers trained model from cloud instance, quantizes it, and pushes to Hugging Face Hub | |
""" | |
import os | |
import sys | |
import json | |
import argparse | |
import logging | |
import subprocess | |
from pathlib import Path | |
from typing import Dict, Any, Optional | |
from datetime import datetime | |
# Setup logging | |
logging.basicConfig( | |
level=logging.INFO, | |
format='%(asctime)s - %(levelname)s - %(message)s' | |
) | |
logger = logging.getLogger(__name__) | |
sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) | |
class ModelRecoveryPipeline: | |
"""Complete model recovery and deployment pipeline""" | |
def __init__( | |
self, | |
model_path: str, | |
repo_name: str, | |
hf_token: Optional[str] = None, | |
private: bool = False, | |
quantize: bool = True, | |
quant_types: Optional[list] = None, | |
trackio_url: Optional[str] = None, | |
experiment_name: Optional[str] = None, | |
dataset_repo: Optional[str] = None, | |
author_name: Optional[str] = None, | |
model_description: Optional[str] = None | |
): | |
self.model_path = Path(model_path) | |
self.repo_name = repo_name | |
self.hf_token = hf_token or os.getenv('HF_TOKEN') | |
self.private = private | |
self.quantize = quantize | |
self.quant_types = quant_types or ["int8_weight_only", "int4_weight_only"] | |
self.trackio_url = trackio_url | |
self.experiment_name = experiment_name | |
self.dataset_repo = dataset_repo | |
self.author_name = author_name | |
self.model_description = model_description | |
# Validate HF token | |
if not self.hf_token: | |
raise ValueError("HF_TOKEN environment variable or --hf-token argument is required") | |
logger.info(f"Initialized ModelRecoveryPipeline for {repo_name}") | |
logger.info(f"Model path: {self.model_path}") | |
logger.info(f"Quantization enabled: {self.quantize}") | |
if self.quantize: | |
logger.info(f"Quantization types: {self.quant_types}") | |
def validate_model_path(self) -> bool: | |
"""Validate that the model path contains required files""" | |
if not self.model_path.exists(): | |
logger.error(f"β Model path does not exist: {self.model_path}") | |
return False | |
# Check for essential model files | |
required_files = ['config.json'] | |
# Check for model files (either safetensors or pytorch) | |
model_files = [ | |
"model.safetensors.index.json", # Safetensors format | |
"pytorch_model.bin" # PyTorch format | |
] | |
missing_files = [] | |
for file in required_files: | |
if not (self.model_path / file).exists(): | |
missing_files.append(file) | |
# Check if at least one model file exists | |
model_file_exists = any((self.model_path / file).exists() for file in model_files) | |
if not model_file_exists: | |
missing_files.extend(model_files) | |
if missing_files: | |
logger.error(f"β Missing required model files: {missing_files}") | |
return False | |
logger.info("β Model files validated") | |
return True | |
def load_training_config(self) -> Dict[str, Any]: | |
"""Load training configuration from model directory""" | |
config_files = [ | |
"training_config.json", | |
"config_petite_llm_3_fr_1_20250727_152504.json", | |
"config_petite_llm_3_fr_1_20250727_152524.json" | |
] | |
for config_file in config_files: | |
config_path = self.model_path / config_file | |
if config_path.exists(): | |
with open(config_path, 'r') as f: | |
config = json.load(f) | |
logger.info(f"β Loaded training config from: {config_file}") | |
return config | |
# Fallback to basic config | |
logger.warning("β οΈ No training config found, using default") | |
return { | |
"model_name": "HuggingFaceTB/SmolLM3-3B", | |
"dataset_name": "OpenHermes-FR", | |
"training_config_type": "Custom Configuration", | |
"trainer_type": "SFTTrainer", | |
"per_device_train_batch_size": 8, | |
"gradient_accumulation_steps": 16, | |
"learning_rate": "5e-6", | |
"num_train_epochs": 3, | |
"max_seq_length": 2048, | |
"dataset_size": "~80K samples", | |
"dataset_format": "Chat format" | |
} | |
def load_training_results(self) -> Dict[str, Any]: | |
"""Load training results from model directory""" | |
results_files = [ | |
"train_results.json", | |
"training_summary_petite_llm_3_fr_1_20250727_152504.json", | |
"training_summary_petite_llm_3_fr_1_20250727_152524.json" | |
] | |
for results_file in results_files: | |
results_path = self.model_path / results_file | |
if results_path.exists(): | |
with open(results_path, 'r') as f: | |
results = json.load(f) | |
logger.info(f"β Loaded training results from: {results_file}") | |
return results | |
# Fallback to basic results | |
logger.warning("β οΈ No training results found, using default") | |
return { | |
"final_loss": "Unknown", | |
"total_steps": "Unknown", | |
"train_loss": "Unknown", | |
"eval_loss": "Unknown" | |
} | |
def push_main_model(self) -> bool: | |
"""Push the main model to Hugging Face Hub""" | |
try: | |
logger.info("π Pushing main model to Hugging Face Hub...") | |
# Import push script | |
from scripts.model_tonic.push_to_huggingface import HuggingFacePusher | |
# Load training data | |
training_config = self.load_training_config() | |
training_results = self.load_training_results() | |
# Initialize pusher | |
pusher = HuggingFacePusher( | |
model_path=str(self.model_path), | |
repo_name=self.repo_name, | |
token=self.hf_token, | |
private=self.private, | |
trackio_url=self.trackio_url, | |
experiment_name=self.experiment_name, | |
dataset_repo=self.dataset_repo, | |
hf_token=self.hf_token, | |
author_name=self.author_name, | |
model_description=self.model_description | |
) | |
# Push model | |
success = pusher.push_model(training_config, training_results) | |
if success: | |
logger.info(f"β Main model pushed successfully to: https://huggingface.co/{self.repo_name}") | |
return True | |
else: | |
logger.error("β Failed to push main model") | |
return False | |
except Exception as e: | |
logger.error(f"β Error pushing main model: {e}") | |
return False | |
def quantize_and_push_models(self) -> bool: | |
"""Quantize and push models to Hugging Face Hub""" | |
if not self.quantize: | |
logger.info("βοΈ Skipping quantization (disabled)") | |
return True | |
try: | |
logger.info("π Starting quantization and push process...") | |
# Import quantization script | |
from scripts.model_tonic.quantize_model import ModelQuantizer | |
success_count = 0 | |
total_count = len(self.quant_types) | |
for quant_type in self.quant_types: | |
logger.info(f"π Processing quantization type: {quant_type}") | |
# Initialize quantizer | |
quantizer = ModelQuantizer( | |
model_path=str(self.model_path), | |
repo_name=self.repo_name, | |
token=self.hf_token, | |
private=self.private, | |
trackio_url=self.trackio_url, | |
experiment_name=self.experiment_name, | |
dataset_repo=self.dataset_repo, | |
hf_token=self.hf_token | |
) | |
# Perform quantization and push | |
success = quantizer.quantize_and_push( | |
quant_type=quant_type, | |
device="auto", | |
group_size=128 | |
) | |
if success: | |
logger.info(f"β {quant_type} quantization and push completed") | |
success_count += 1 | |
else: | |
logger.error(f"β {quant_type} quantization and push failed") | |
logger.info(f"π Quantization summary: {success_count}/{total_count} successful") | |
return success_count > 0 | |
except Exception as e: | |
logger.error(f"β Error during quantization: {e}") | |
return False | |
def run_complete_pipeline(self) -> bool: | |
"""Run the complete model recovery and deployment pipeline""" | |
logger.info("π Starting complete model recovery and deployment pipeline") | |
# Step 1: Validate model path | |
if not self.validate_model_path(): | |
logger.error("β Model validation failed") | |
return False | |
# Step 2: Push main model | |
if not self.push_main_model(): | |
logger.error("β Main model push failed") | |
return False | |
# Step 3: Quantize and push models | |
if not self.quantize_and_push_models(): | |
logger.warning("β οΈ Quantization failed, but main model was pushed successfully") | |
logger.info("π Model recovery and deployment pipeline completed!") | |
logger.info(f"π View your model at: https://huggingface.co/{self.repo_name}") | |
return True | |
def parse_args(): | |
"""Parse command line arguments""" | |
parser = argparse.ArgumentParser(description='Recover and deploy trained model to Hugging Face Hub') | |
# Required arguments | |
parser.add_argument('model_path', type=str, help='Path to trained model directory') | |
parser.add_argument('repo_name', type=str, help='Hugging Face repository name (username/repo-name)') | |
# Optional arguments | |
parser.add_argument('--hf-token', type=str, default=None, help='Hugging Face token') | |
parser.add_argument('--private', action='store_true', help='Make repository private') | |
parser.add_argument('--no-quantize', action='store_true', help='Skip quantization') | |
parser.add_argument('--quant-types', nargs='+', | |
choices=['int8_weight_only', 'int4_weight_only', 'int8_dynamic'], | |
default=['int8_weight_only', 'int4_weight_only'], | |
help='Quantization types to apply') | |
parser.add_argument('--trackio-url', type=str, default=None, help='Trackio Space URL for logging') | |
parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for Trackio') | |
parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage') | |
parser.add_argument('--author-name', type=str, default=None, help='Author name for model card') | |
parser.add_argument('--model-description', type=str, default=None, help='Model description for model card') | |
return parser.parse_args() | |
def main(): | |
"""Main function""" | |
args = parse_args() | |
# Setup logging | |
logging.basicConfig( | |
level=logging.INFO, | |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' | |
) | |
logger.info("Starting model recovery and deployment pipeline") | |
# Initialize pipeline | |
try: | |
pipeline = ModelRecoveryPipeline( | |
model_path=args.model_path, | |
repo_name=args.repo_name, | |
hf_token=args.hf_token, | |
private=args.private, | |
quantize=not args.no_quantize, | |
quant_types=args.quant_types, | |
trackio_url=args.trackio_url, | |
experiment_name=args.experiment_name, | |
dataset_repo=args.dataset_repo, | |
author_name=args.author_name, | |
model_description=args.model_description | |
) | |
# Run complete pipeline | |
success = pipeline.run_complete_pipeline() | |
if success: | |
logger.info("β Model recovery and deployment completed successfully!") | |
return 0 | |
else: | |
logger.error("β Model recovery and deployment failed!") | |
return 1 | |
except Exception as e: | |
logger.error(f"β Error during model recovery: {e}") | |
return 1 | |
if __name__ == "__main__": | |
exit(main()) |