#!/usr/bin/env python3
"""
GPT-OSS Model Push Script
Specialized script for pushing GPT-OSS models to Hugging Face Hub
Handles LoRA weight merging and model card generation
"""
import os
import sys
import argparse
import shutil
from datetime import datetime

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

def merge_lora_weights(checkpoint_path, base_model_name, output_path):
"""Merge LoRA weights with base model for inference"""
print(f"Loading base model: {base_model_name}")
    # Load base model; device_map="auto" already places the weights on the
    # available GPU(s), so no explicit .cuda() call is needed (it can clash
    # with accelerate's device dispatching).
    model_kwargs = {
        "attn_implementation": "eager",
        "torch_dtype": "auto",
        "use_cache": True,
        "device_map": "auto",
    }
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_kwargs)
print(f"Loading LoRA weights from: {checkpoint_path}")
# Load and merge LoRA weights
model = PeftModel.from_pretrained(base_model, checkpoint_path)
model = model.merge_and_unload()
print(f"Saving merged model to: {output_path}")
model.save_pretrained(output_path)
# Save tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.save_pretrained(output_path)
return model, tokenizer
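
# Example (hypothetical local paths): merge a LoRA checkpoint into the base
# model and write the merged weights plus tokenizer to a local directory:
#
#   model, tokenizer = merge_lora_weights(
#       checkpoint_path="/path/to/lora_checkpoint",
#       base_model_name="openai/gpt-oss-20b",
#       output_path="/tmp/gpt_oss_merged",
#   )
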
def create_gpt_oss_model_card(
    model_name, experiment_name, trackio_url, dataset_repo, author_name,
    model_description, training_config_type=None, dataset_name=None,
    batch_size=None, learning_rate=None, max_epochs=None,
    max_seq_length=None, trainer_type=None,
):
"""Create a comprehensive model card for GPT-OSS models using generate_model_card.py"""
try:
        # Make the sibling generate_model_card module importable
        sys.path.append(os.path.dirname(__file__))
from generate_model_card import ModelCardGenerator, create_default_variables
# Create generator
generator = ModelCardGenerator()
# Create variables for the model card
variables = create_default_variables()
# Update with GPT-OSS specific values
variables.update({
"repo_name": model_name,
"model_name": model_name.split('/')[-1],
"experiment_name": experiment_name or "gpt_oss_finetune",
"dataset_repo": dataset_repo,
"author_name": author_name or "GPT-OSS Fine-tuner",
"model_description": model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks.",
"training_config_type": training_config_type or "GPT-OSS Configuration",
"base_model": "openai/gpt-oss-20b",
"dataset_name": dataset_name or "HuggingFaceH4/Multilingual-Thinking",
"trainer_type": trainer_type or "SFTTrainer",
"batch_size": str(batch_size) if batch_size else "4",
"learning_rate": str(learning_rate) if learning_rate else "2e-4",
"max_epochs": str(max_epochs) if max_epochs else "1",
"max_seq_length": str(max_seq_length) if max_seq_length else "2048",
"hardware_info": "GPU (H100/A100)",
"trackio_url": trackio_url or "N/A",
"training_loss": "N/A",
"validation_loss": "N/A",
"perplexity": "N/A",
"quantized_models": False
})
# Generate the model card
model_card_content = generator.generate_model_card(variables)
print("✅ Model card generated using generate_model_card.py")
return model_card_content
except Exception as e:
print(f"❌ Failed to generate model card with generator: {e}")
print("🔄 Falling back to original GPT-OSS model card")
return _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description)
def _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
"""Create the original GPT-OSS model card as fallback"""
card_content = f"""---
language:
- en
- es
- fr
- it
- de
- zh
- hi
- ja
- ko
- ar
license: mit
tags:
- gpt-oss
- multilingual
- reasoning
- chain-of-thought
- fine-tuned
---
# {model_name}
## Model Description
{model_description}
This model is a fine-tuned version of OpenAI's GPT-OSS-20B model, optimized for multilingual reasoning tasks. It has been trained on the Multilingual-Thinking dataset to generate chain-of-thought reasoning in multiple languages.
## Training Details
- **Base Model**: openai/gpt-oss-20b
- **Training Dataset**: HuggingFaceH4/Multilingual-Thinking
- **Training Method**: LoRA (Low-Rank Adaptation)
- **Quantization**: MXFP4
- **Experiment**: {experiment_name}
- **Monitoring**: {trackio_url}
## Usage
### Basic Usage
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("{model_name}")
model = AutoModelForCausalLM.from_pretrained("{model_name}")
# Example: Reasoning in Spanish
messages = [
{{"role": "system", "content": "reasoning language: Spanish"}},
{{"role": "user", "content": "What is the capital of Australia?"}}
]
input_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=512)
response = tokenizer.batch_decode(output_ids)[0]
print(response)
```
### Multilingual Reasoning
The model supports reasoning in multiple languages:
- English
- Spanish (Español)
- French (Français)
- Italian (Italiano)
- German (Deutsch)
- Chinese (中文)
- Hindi (हिन्दी)
- Japanese (日本語)
- Korean (한국어)
- Arabic (العربية)
### System Prompt Format
To control the reasoning language, use the system prompt:
```
reasoning language: [LANGUAGE]
```
Example:
```
reasoning language: German
```
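For instance, combined with the chat template call from the Basic Usage section:
```python
messages = [
    {{"role": "system", "content": "reasoning language: German"}},
    {{"role": "user", "content": "What is the capital of Australia?"}}
]
```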
## Training Configuration
- **LoRA Rank**: 8
- **LoRA Alpha**: 16
- **Target Modules**: all-linear
- **Learning Rate**: 2e-4
- **Batch Size**: 4
- **Sequence Length**: 2048
- **Mixed Precision**: bf16
## Dataset Information
The model was trained on the Multilingual-Thinking dataset, which contains 1,000 examples of chain-of-thought reasoning translated into multiple languages.
## Limitations
- The model is designed for reasoning tasks and may not perform optimally on other tasks
- Reasoning quality may vary across languages
- The model inherits limitations from the base GPT-OSS-20B model
## Citation
If you use this model in your research, please cite:
```bibtex
@misc{{{model_name.replace("/", "_").replace("-", "_")},
author = {{{author_name}}},
title = {{{model_name}}},
year = {{{datetime.now().year}}},
publisher = {{Hugging Face}},
journal = {{Hugging Face repository}},
howpublished = {{\\url{{https://huggingface.co/{model_name}}}}}
}}
```
## License
This model is licensed under the MIT License.
## Training Resources
- **Training Dataset**: https://huggingface.co/datasets/{dataset_repo}
- **Training Monitoring**: {trackio_url}
- **Base Model**: https://huggingface.co/openai/gpt-oss-20b
## Model Information
- **Architecture**: GPT-OSS-20B with LoRA adapters
- **Parameters**: 20B base + LoRA adapters
- **Context Length**: 2048 tokens
- **Languages**: 10+ languages supported
- **Task**: Multilingual reasoning and chain-of-thought generation
"""
return card_content
def push_gpt_oss_model(
    checkpoint_path, repo_name, hf_token, trackio_url, experiment_name,
    dataset_repo, author_name, model_description, training_config_type=None,
    model_name=None, dataset_name=None, batch_size=None, learning_rate=None,
    max_epochs=None, max_seq_length=None, trainer_type=None,
):
"""Push GPT-OSS model to Hugging Face Hub"""
print("=== GPT-OSS Model Push Pipeline ===")
print(f"Checkpoint: {checkpoint_path}")
print(f"Repository: {repo_name}")
print(f"Experiment: {experiment_name}")
print(f"Author: {author_name}")
# Validate checkpoint path
if not os.path.exists(checkpoint_path):
raise FileNotFoundError(f"Checkpoint path not found: {checkpoint_path}")
# Create temporary directory for merged model
temp_output = f"/tmp/gpt_oss_merged_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(temp_output, exist_ok=True)
try:
        # Merge LoRA weights with the base model (overridable via --model-name)
        print("Merging LoRA weights with base model...")
        model, tokenizer = merge_lora_weights(
            checkpoint_path=checkpoint_path,
            base_model_name=model_name or "openai/gpt-oss-20b",
            output_path=temp_output
        )
# Create model card
print("Creating model card...")
model_card_content = create_gpt_oss_model_card(
model_name=repo_name,
experiment_name=experiment_name,
trackio_url=trackio_url,
dataset_repo=dataset_repo,
author_name=author_name,
model_description=model_description,
training_config_type=training_config_type,
dataset_name=dataset_name,
batch_size=batch_size,
learning_rate=learning_rate,
max_epochs=max_epochs,
max_seq_length=max_seq_length,
trainer_type=trainer_type
)
# Save model card
model_card_path = os.path.join(temp_output, "README.md")
with open(model_card_path, "w", encoding="utf-8") as f:
f.write(model_card_content)
# Push to Hugging Face Hub
print(f"Pushing model to: {repo_name}")
        # Authenticate with the provided token
        from huggingface_hub import HfApi
        api = HfApi(token=hf_token)
# Create repository if it doesn't exist
try:
api.create_repo(repo_name, private=False, exist_ok=True)
except Exception as e:
print(f"Warning: Could not create repository: {e}")
# Upload files
print("Uploading model files...")
api.upload_folder(
folder_path=temp_output,
repo_id=repo_name,
repo_type="model"
)
print("✅ GPT-OSS model pushed successfully!")
print(f"Model URL: https://huggingface.co/{repo_name}")
        # Clean up the temporary directory
        shutil.rmtree(temp_output)
        return True
except Exception as e:
print(f"❌ Error pushing GPT-OSS model: {e}")
        # Clean up on error
        if os.path.exists(temp_output):
            shutil.rmtree(temp_output)
        return False
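
# Example programmatic use (hypothetical repo name and token; most metadata
# arguments only affect the generated model card):
#
#   success = push_gpt_oss_model(
#       checkpoint_path="/path/to/lora_checkpoint",
#       repo_name="username/my-gpt-oss-finetune",
#       hf_token="hf_xxx",
#       trackio_url=None,
#       experiment_name="gpt_oss_finetune",
#       dataset_repo="HuggingFaceH4/Multilingual-Thinking",
#       author_name="Your Name",
#       model_description="Fine-tuned GPT-OSS-20B for multilingual reasoning.",
#   )
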
def main():
parser = argparse.ArgumentParser(description="Push GPT-OSS model to Hugging Face Hub")
parser.add_argument("checkpoint_path", help="Path to model checkpoint")
parser.add_argument("repo_name", help="Hugging Face repository name")
parser.add_argument("--token", required=True, help="Hugging Face token")
parser.add_argument("--trackio-url", help="Trackio URL for model card")
parser.add_argument("--experiment-name", help="Experiment name")
parser.add_argument("--dataset-repo", help="Dataset repository")
parser.add_argument("--author-name", help="Author name")
parser.add_argument("--model-description", help="Model description")
parser.add_argument("--training-config-type", help="Training configuration type")
parser.add_argument("--model-name", help="Base model name")
parser.add_argument("--dataset-name", help="Dataset name")
parser.add_argument("--batch-size", help="Batch size")
parser.add_argument("--learning-rate", help="Learning rate")
parser.add_argument("--max-epochs", help="Maximum epochs")
parser.add_argument("--max-seq-length", help="Maximum sequence length")
parser.add_argument("--trainer-type", help="Trainer type")
args = parser.parse_args()
# Set defaults
experiment_name = args.experiment_name or "gpt_oss_finetune"
dataset_repo = args.dataset_repo or "HuggingFaceH4/Multilingual-Thinking"
author_name = args.author_name or "GPT-OSS Fine-tuner"
model_description = args.model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks."
success = push_gpt_oss_model(
checkpoint_path=args.checkpoint_path,
repo_name=args.repo_name,
hf_token=args.token,
trackio_url=args.trackio_url,
experiment_name=experiment_name,
dataset_repo=dataset_repo,
author_name=author_name,
model_description=model_description,
training_config_type=args.training_config_type,
model_name=args.model_name,
dataset_name=args.dataset_name,
batch_size=args.batch_size,
learning_rate=args.learning_rate,
max_epochs=args.max_epochs,
max_seq_length=args.max_seq_length,
trainer_type=args.trainer_type
)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()