#!/usr/bin/env python

# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# usage: adjust the configs if wanted, but otherwise just run the script

from pathlib import Path
from types import SimpleNamespace

import torchvision.transforms as transforms
from PIL import Image

from m4.models.vllama.modeling_vllama import VLlamaConfig, VLlamaForCausalLM
from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
from m4.training.utils import get_tokenizer
mname_tiny = "tiny-random-vllama-clip"

path = Path(mname_tiny)
path.mkdir(parents=True, exist_ok=True)
# from the hardcoded https://github.com/huggingface/m4/blob/adf102f0000cb2632cd8a3ebb87398c65e448a97/m4/training/main.py#L80
additional_vocab_size = 2
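# (the 2 accounts for the two extra tokens added further below: <fake_token_around_image> and <image>)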
config = VLlamaConfig()
config.update(
    dict(
        ffn_dim=64,
        hidden_size=16,
        max_position_embeddings=128,
        num_attention_heads=4,
        num_hidden_layers=2,
        word_embed_proj_dim=16,
        max_new_tokens=100,
        use_resampler=True,
        resampler_depth=2,
        resampler_head_dim=8,
        resampler_n_heads=2,
        resampler_n_latents=16,
        vision_embed_dim=32,
        vision_image_size=30,
        vision_model_name="hf-internal-testing/tiny-random-clip",
        vision_model_params="{}",
        vocab_size=32000,
        additional_vocab_size=additional_vocab_size,
    )
)
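# every dimension above is deliberately tiny, and vision_model_name points at a tiny random CLIP
# checkpoint, so the resulting model stays small and fast to load in tests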
# print(config)

# the config can now be tweaked further with tiny values if needed
model = VLlamaForCausalLM.from_config(config)
# print(model.config)
# print(model)
tokenizer_config = dict(
    tokenizer_add_special_tokens="{}",
    tokenizer_add_tokens=(
        '[AddedToken("<fake_token_around_image>", rstrip=False, lstrip=False), AddedToken("<image>", rstrip=False,'
        " lstrip=False)]"
    ),
    tokenizer_name="HuggingFaceM4/huggy-llama-tokenizer-7b",
    tokenizer_params='{"use_fast": True}',
)
tokenizer_config = SimpleNamespace(**tokenizer_config)
# print(tokenizer_config)

tokenizer = get_tokenizer(
    tokenizer_name=tokenizer_config.tokenizer_name,
    tokenizer_add_tokens=tokenizer_config.tokenizer_add_tokens,
    tokenizer_add_special_tokens=tokenizer_config.tokenizer_add_special_tokens,
    tokenizer_params=tokenizer_config.tokenizer_params,
    additional_vocab_size=model.config.additional_vocab_size,
    model_vocab_size=model.config.vocab_size,
)
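# sanity check: the custom image token must have made it into the tokenizer vocab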
assert "<image>" in tokenizer.get_vocab()
# Test w/ one image and one text
query = "<fake_token_around_image><image><fake_token_around_image>This is a picture of a cat."
query_tokens = tokenizer(query, return_tensors="pt")

num_images_per_ex = 1
pixel_values = transforms.ToTensor()(Image.new("RGB", (30, 30))).repeat(1, 1, 1, 1).unsqueeze(0)
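# pixel_values ends up with shape (batch_size=1, num_images=1, 3, 30, 30), matching vision_image_size=30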
image_attention_mask, _ = image_attention_mask_for_packed_input_ids(query_tokens["input_ids"], tokenizer)
image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=num_images_per_ex)
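# the binary image_attention_mask tells each text token which of the packed images it may attend to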
inputs = {
    "input_ids": query_tokens["input_ids"],
    "attention_mask": query_tokens["attention_mask"],
    "pixel_values": pixel_values,
    "image_attention_mask": image_attention_mask,
}
# debug shapes
# print(query_tokens["input_ids"].shape)
# print(query_tokens["attention_mask"].shape)
# print(pixel_values.shape)
# print(image_attention_mask.shape)
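# run a short generation as an end-to-end smoke test of the forward pass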
out_gen = model.generate(**inputs)
text = tokenizer.batch_decode(out_gen)
# print(text)
# Save model + config + tokenizer
model.half()  # makes it smaller
model.save_pretrained(path)
tokenizer.save_pretrained(path)
# test we can load it back
model = VLlamaForCausalLM.from_pretrained(path)

print(f"Generated {mname_tiny} - Upload the generated folder to the hub")