# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script generates tiny models used in the TRL library for unit tests. It pushes them to the Hub under the
# `trl-internal-testing` organization.
# This script is meant to be run when adding a new tiny model to the TRL library.

from huggingface_hub import HfApi, ModelCard
from torch import nn
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    BartConfig,
    BartModel,
    BloomConfig,
    BloomForCausalLM,
    CohereConfig,
    CohereForCausalLM,
    DbrxConfig,
    DbrxForCausalLM,
    DeepseekV3Config,
    DeepseekV3ForCausalLM,
    FalconMambaConfig,
    FalconMambaForCausalLM,
    Gemma2Config,
    Gemma2ForCausalLM,
    Gemma3Config,
    Gemma3ForConditionalGeneration,
    GemmaConfig,
    GemmaForCausalLM,
    GPT2Config,
    GPT2LMHeadModel,
    GPTNeoXConfig,
    GPTNeoXForCausalLM,
    GptOssConfig,
    GptOssForCausalLM,
    Idefics2Config,
    Idefics2ForConditionalGeneration,
    LlamaConfig,
    LlamaForCausalLM,
    LlamaForSequenceClassification,
    LlavaConfig,
    LlavaForConditionalGeneration,
    LlavaNextConfig,
    LlavaNextForConditionalGeneration,
    MistralConfig,
    MistralForCausalLM,
    OPTConfig,
    OPTForCausalLM,
    PaliGemmaConfig,
    PaliGemmaForConditionalGeneration,
    Phi3Config,
    Phi3ForCausalLM,
    Qwen2_5_VLConfig,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2Config,
    Qwen2ForCausalLM,
    Qwen2ForSequenceClassification,
    Qwen2VLConfig,
    Qwen2VLForConditionalGeneration,
    Qwen3Config,
    Qwen3ForCausalLM,
    Qwen3ForSequenceClassification,
    Qwen3MoeConfig,
    Qwen3MoeForCausalLM,
    SmolVLMConfig,
    SmolVLMForConditionalGeneration,
    T5Config,
    T5ForConditionalGeneration,
)


ORGANIZATION = "trl-internal-testing"

MODEL_CARD = """
---
library_name: transformers
tags: [trl]
---

# Tiny {model_class_name}

This is a minimal model built for unit tests in the [TRL](https://github.com/huggingface/trl) library.
"""


api = HfApi()


def push_to_hub(model, tokenizer, prefix=None, suffix=None):
    model_class_name = model.__class__.__name__
    content = MODEL_CARD.format(model_class_name=model_class_name)
    model_card = ModelCard(content)
    if prefix is not None:
        model_class_name = f"{prefix}-{model_class_name}"
    repo_id = f"{ORGANIZATION}/{model_class_name}"
    if suffix is not None:
        repo_id += f"-{suffix}"

    if api.repo_exists(repo_id):
        print(f"Model {repo_id} already exists, skipping")
    else:
        model.push_to_hub(repo_id)
        tokenizer.push_to_hub(repo_id)
        model_card.push_to_hub(repo_id)
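

# Naming sketch (illustrative, derived from the push_to_hub logic above): the repo id is built from the prefix,
# the model class name, and an optional suffix, e.g.
#
#     push_to_hub(Qwen2ForCausalLM(config), tokenizer, "tiny", "2.5")
#     # -> pushes to "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"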


def init_weights_tiny_model(model):
    """
    Initialize tiny test models to avoid NaNs from uninitialized weights.

    Uses safe defaults:
    - Linear/Conv1d: Xavier uniform (weights), zero (biases)
    - Embedding: Normal(0, 0.02)
    - LayerNorm: Ones (weights), zero (biases)

    Args:
        model: PyTorch model (modified in-place)
    """
    for module in model.modules():
        if isinstance(module, nn.Linear):
            # Attention/MLP projections → Xavier uniform weights, zero biases
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            nn.init.xavier_uniform_(module.weight)
        elif isinstance(module, nn.Embedding):
            # Token embeddings → GPT-style Normal(0, 0.02)
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            # LayerNorm weights always 1, bias 0
            nn.init.ones_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Conv1d):
            # Convolutional layers → Xavier uniform weights, zero biases
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            nn.init.xavier_uniform_(module.weight)


# Decoder models
for model_id, config_class, model_class, suffix in [
    ("bigscience/bloomz-560m", BloomConfig, BloomForCausalLM, None),
    ("CohereForAI/aya-expanse-8b", CohereConfig, CohereForCausalLM, None),
    ("databricks/dbrx-instruct", DbrxConfig, DbrxForCausalLM, None),
    ("deepseek-ai/DeepSeek-R1", DeepseekV3Config, DeepseekV3ForCausalLM, None),
    # It's important to have R1-0528 as it doesn't have the same chat template
    ("deepseek-ai/DeepSeek-R1-0528", DeepseekV3Config, DeepseekV3ForCausalLM, "0528"),
    ("tiiuae/falcon-7b-instruct", FalconMambaConfig, FalconMambaForCausalLM, None),
    ("google/gemma-2-2b-it", Gemma2Config, Gemma2ForCausalLM, None),
    ("google/gemma-7b-it", GemmaConfig, GemmaForCausalLM, None),
    ("openai-community/gpt2", GPT2Config, GPT2LMHeadModel, None),
    ("EleutherAI/pythia-14m", GPTNeoXConfig, GPTNeoXForCausalLM, None),
    ("meta-llama/Meta-Llama-3-8B-Instruct", LlamaConfig, LlamaForCausalLM, "3"),
    ("meta-llama/Llama-3.1-8B-Instruct", LlamaConfig, LlamaForCausalLM, "3.1"),
    ("meta-llama/Llama-3.2-1B-Instruct", LlamaConfig, LlamaForCausalLM, "3.2"),
    ("mistralai/Mistral-7B-Instruct-v0.1", MistralConfig, MistralForCausalLM, "0.1"),
    ("mistralai/Mistral-7B-Instruct-v0.2", MistralConfig, MistralForCausalLM, "0.2"),
    ("facebook/opt-1.3b", OPTConfig, OPTForCausalLM, None),
    ("microsoft/Phi-3.5-mini-instruct", Phi3Config, Phi3ForCausalLM, None),
    ("Qwen/Qwen2.5-32B-Instruct", Qwen2Config, Qwen2ForCausalLM, "2.5"),
    ("Qwen/Qwen2.5-Coder-0.5B", Qwen2Config, Qwen2ForCausalLM, "2.5-Coder"),
    ("Qwen/Qwen3-8B", Qwen3Config, Qwen3ForCausalLM, None),
]:
    revision = "refs/pr/14" if model_id == "Qwen/Qwen3-8B" else "main"  # chat template with {% generation %}
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    config = config_class(
        vocab_size=tokenizer.vocab_size + len(tokenizer.added_tokens_encoder.keys()),
        hidden_size=8,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
    )
    model = model_class(config)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, "tiny", suffix)


# MoE models
for model_id, config_class, model_class, suffix in [
    ("Qwen/Qwen3-30B-A3B", Qwen3MoeConfig, Qwen3MoeForCausalLM, None),
    ("openai/gpt-oss-20b", GptOssConfig, GptOssForCausalLM, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = config_class(
        vocab_size=tokenizer.vocab_size + len(tokenizer.added_tokens_encoder.keys()),
        hidden_size=8,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
        num_experts=4,
        num_experts_per_tok=2,
    )
    model = model_class(config)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, "tiny", suffix)
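

# Optional sanity check (illustrative sketch, not part of the upload flow): a tiny model built as above can be
# smoke-tested with a short forward pass before pushing, e.g.
#
#     import torch
#
#     inputs = tokenizer("Hello", return_tensors="pt")
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     assert not torch.isnan(logits).any()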


# Two slightly bigger models, required for vLLM testing
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
config = Qwen2Config(
    vocab_size=tokenizer.vocab_size + len(tokenizer.added_tokens_encoder.keys()),
    hidden_size=128,  # increase hidden size so that hidden_size // num_attention_heads = 32, required for vLLM
    num_attention_heads=4,
    num_key_value_heads=2,
    num_hidden_layers=2,
    intermediate_size=32,
)
model = Qwen2ForCausalLM(config)
push_to_hub(model, tokenizer, "small", "2.5")

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
config = Qwen3Config(
    vocab_size=tokenizer.vocab_size + len(tokenizer.added_tokens_encoder.keys()),
    hidden_size=128,  # increase hidden size so that hidden_size // num_attention_heads = 32, required for vLLM
    num_attention_heads=4,
    num_key_value_heads=2,
    num_hidden_layers=2,
    intermediate_size=32,
)
model = Qwen3ForCausalLM(config)
push_to_hub(model, tokenizer, "small")


# Reward models
for model_id, config_class, model_class, suffix in [
    ("meta-llama/Llama-3.2-1B-Instruct", LlamaConfig, LlamaForSequenceClassification, "3.2"),
    ("Qwen/Qwen2.5-32B-Instruct", Qwen2Config, Qwen2ForSequenceClassification, "2.5"),
    ("Qwen/Qwen3-4B", Qwen3Config, Qwen3ForSequenceClassification, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = config_class(
        vocab_size=tokenizer.vocab_size + len(tokenizer.added_tokens_encoder.keys()),
        hidden_size=8,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
        num_labels=1,
    )
    model = model_class(config)
    push_to_hub(model, tokenizer, "tiny", suffix)


# Encoder-decoder models
for model_id, config_class, model_class, suffix in [
    ("facebook/bart-base", BartConfig, BartModel, None),
    ("google/flan-t5-small", T5Config, T5ForConditionalGeneration, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = config_class(
        vocab_size=tokenizer.vocab_size + len(tokenizer.added_tokens_encoder.keys()),
        d_model=16,
        encoder_layers=2,
        decoder_layers=2,
        d_kv=2,
        d_ff=64,
        num_layers=6,
        num_heads=8,
        decoder_start_token_id=0,
        is_encoder_decoder=True,
    )
    model = model_class(config)
    push_to_hub(model, tokenizer, "tiny", suffix)


# Vision Language Models
for model_id, config_class, model_class in [
    ("google/gemma-3-4b-it", Gemma3Config, Gemma3ForConditionalGeneration),
    ("google/paligemma-3b-pt-224", PaliGemmaConfig, PaliGemmaForConditionalGeneration),
    ("HuggingFaceM4/idefics2-8b", Idefics2Config, Idefics2ForConditionalGeneration),
    ("HuggingFaceTB/SmolVLM2-2.2B-Instruct", SmolVLMConfig, SmolVLMForConditionalGeneration),
    ("llava-hf/llava-1.5-7b-hf", LlavaConfig, LlavaForConditionalGeneration),
    ("llava-hf/llava-v1.6-mistral-7b-hf", LlavaNextConfig, LlavaNextForConditionalGeneration),
    ("Qwen/Qwen2-VL-2B-Instruct", Qwen2VLConfig, Qwen2VLForConditionalGeneration),
    ("Qwen/Qwen2.5-VL-3B-Instruct", Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration),
]:
    processor = AutoProcessor.from_pretrained(model_id)
    kwargs = {}
    text_kwargs = {}
    vision_kwargs = {}
    if config_class == PaliGemmaConfig:
        kwargs["projection_dim"] = 8
    if config_class in [LlavaConfig, LlavaNextConfig, PaliGemmaConfig]:
        vision_kwargs["projection_dim"] = 8
    if config_class in [LlavaConfig, LlavaNextConfig]:
        vision_kwargs["image_size"] = 336
        vision_kwargs["patch_size"] = 14
    if config_class in [Qwen2VLConfig, Qwen2_5_VLConfig]:
        kwargs["vision_start_token_id"] = 151652
        text_kwargs["rope_scaling"] = {"type": "mrope", "mrope_section": [1]}
        vision_kwargs["depth"] = 4
        vision_kwargs["embed_dim"] = 64
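    # The per-architecture kwargs above keep the tiny configs consistent with their processors (e.g. Qwen2-VL's
    # mrope rope_scaling and vision_start_token_id, Llava's 336/14 image and patch sizes).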
    config = config_class(
        text_config=dict(
            vocab_size=processor.tokenizer.vocab_size + len(processor.tokenizer.added_tokens_encoder),
            hidden_size=8,
            num_attention_heads=4,
            num_key_value_heads=2,
            num_hidden_layers=2,
            intermediate_size=32,
            **text_kwargs,
        ),
        vision_config=dict(
            hidden_size=16,
            num_attention_heads=4,
            num_hidden_layers=2,
            intermediate_size=32,
            **vision_kwargs,
        ),
        **kwargs,
    )
    model = model_class(config)
    push_to_hub(model, processor, "tiny")
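

# Example (illustrative assumption, mirroring how TRL unit tests consume these repos): once pushed, a tiny model
# can be loaded by its repo id, e.g.
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
#     tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")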