Kwai-Keye committed on
Commit
03ea770
·
verified ·
1 Parent(s): 8bcbb54

Add model files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ asset/architecture.png filter=lfs diff=lfs merge=lfs -text
37
+ asset/keye_logo_2.png filter=lfs diff=lfs merge=lfs -text
38
+ asset/post2.jpeg filter=lfs diff=lfs merge=lfs -text
39
+ asset/pre-train.png filter=lfs diff=lfs merge=lfs -text
40
+ asset/teaser.png filter=lfs diff=lfs merge=lfs -text
41
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,469 @@
1
- ---
2
- license: apache-2.0
3
- ---
1
+
2
+ ---
3
+ license: apache-2.0
4
+ language:
5
+ - en
6
+ pipeline_tag: image-text-to-text
7
+ tags:
8
+ - multimodal
9
+ library_name: transformers
10
+ ---
11
+
12
+ # Kwai Keye-VL
13
+
14
+ <div align="center">
15
+ <img src="asset/keye_logo_2.png" width="100%" alt="Kwai Keye-VL Logo">
16
+ </div>
17
+
18
+ <font size=3><div align='center' > [[🍎 Home Page](https://kwai-keye.github.io/)] [[📖 Technical Report]()] [[📊 Models](https://huggingface.co/Kwai-Keye)] </div></font>
19
+
20
+ ## 🔥 News
21
+ * **`2025.06.26`** 🌟 We are very proud to launch **Kwai Keye-VL**, a cutting-edge multimodal large language model meticulously crafted by the **Kwai Keye Team** at [Kuaishou](https://www.kuaishou.com/). As a cornerstone AI product within Kuaishou's advanced technology ecosystem, Keye excels in video understanding, visual perception, and reasoning tasks, setting new benchmarks in performance. Our team is working tirelessly to push the boundaries of what's possible, so stay tuned for more exciting updates!
22
+
23
+ <div align="center">
24
+ <img src="asset/teaser.png" width="100%" alt="Kwai Keye-VL Performance">
25
+ </div>
26
+
27
+ ## Quickstart
28
+
29
+ Below, we provide simple examples to show how to use Kwai Keye-VL with 🤗 Transformers.
30
+
31
+ The code of Kwai Keye-VL has been included in the latest Hugging Face Transformers, and we advise you to build from source with the following command:
32
+ ```
33
+ pip install git+https://github.com/huggingface/transformers accelerate
34
+ ```
35
+ Otherwise, you might encounter the following error:
36
+ ```
37
+ KeyError: 'Keye-VL'
38
+ ```
39
+
40
+
41
+ We offer a toolkit to help you handle various types of visual input more conveniently, as if you were using an API. This includes base64, URLs, and interleaved images and videos. You can install it using the following command:
42
+
43
+ ```bash
44
+ # It's highly recommended to use the `[decord]` feature for faster video loading.
45
+ pip install keye-vl-utils[decord]==1.0.0
46
+ ```
47
+
48
+ If you are not using Linux, you might not be able to install `decord` from PyPI. In that case, you can use `pip install keye-vl-utils`, which will fall back to torchvision for video processing. However, you can still [install decord from source](https://github.com/dmlc/decord?tab=readme-ov-file#install-from-source) to have decord used when loading videos.
49
+
50
+ ### Using 🤗 Transformers to Chat
51
+
52
+ Here is a code snippet showing how to use the chat model with `transformers` and `keye_vl_utils`:
53
+
54
+ ```python
55
+ from transformers import AutoModel, AutoTokenizer, AutoProcessor
56
+ from keye_vl_utils import process_vision_info
57
+
58
+ # default: Load the model on the available device(s)
59
+ model_path = "Kwai-Keye/Keye-VL-8B-Preview"
60
+
61
+ model = AutoModel.from_pretrained(
62
+ model_path, torch_dtype="auto", device_map="auto", attn_implementation="flash_attention_2", trust_remote_code=True,
63
+ ).to('cuda')
64
+
65
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
66
+ # model = KeyeForConditionalGeneration.from_pretrained(
67
+ # "Kwai-Keye/Keye-VL-8B-Preview",
68
+ # torch_dtype=torch.bfloat16,
69
+ # attn_implementation="flash_attention_2",
70
+ # device_map="auto",
71
+ # )
72
+
73
+ # default processor
74
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
75
+
76
+ # The default range for the number of visual tokens per image in the model is 4-16384.
77
+ # You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
78
+ # min_pixels = 256*28*28
79
+ # max_pixels = 1280*28*28
80
+ # processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels, trust_remote_code=True)
81
+
82
+ messages = [
83
+ {
84
+ "role": "user",
85
+ "content": [
86
+ {
87
+ "type": "image",
88
+ "image": "https://s1-11508.kwimgs.com/kos/nlav11508/mllm_all/ziran_jiafeimao_11.jpg",
89
+ },
90
+ {"type": "text", "text": "Describe this image."},
91
+ ],
92
+ }
93
+ ]
94
+
95
+ # Preparation for inference
96
+ text = processor.apply_chat_template(
97
+ messages, tokenize=False, add_generation_prompt=True
98
+ )
99
+ image_inputs, video_inputs = process_vision_info(messages)
100
+ inputs = processor(
101
+ text=[text],
102
+ images=image_inputs,
103
+ videos=video_inputs,
104
+ padding=True,
105
+ return_tensors="pt",
106
+ )
107
+ inputs = inputs.to("cuda")
108
+
109
+ # Inference: Generation of the output
110
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
111
+ generated_ids_trimmed = [
112
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
113
+ ]
114
+ output_text = processor.batch_decode(
115
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
116
+ )
117
+ print(output_text)
118
+ ```
119
+
120
121
+
122
+ <details>
123
+ <summary>Video inference</summary>
124
+
125
+ ```python
126
+ # Messages containing a list of images as a video and a text query
127
+ messages = [
128
+ {
129
+ "role": "user",
130
+ "content": [
131
+ {
132
+ "type": "video",
133
+ "video": [
134
+ "file:///path/to/frame1.jpg",
135
+ "file:///path/to/frame2.jpg",
136
+ "file:///path/to/frame3.jpg",
137
+ "file:///path/to/frame4.jpg",
138
+ ],
139
+ },
140
+ {"type": "text", "text": "Describe this video."},
141
+ ],
142
+ }
143
+ ]
144
+
145
+ # Messages containing a local video path and a text query
146
+ messages = [
147
+ {
148
+ "role": "user",
149
+ "content": [
150
+ {
151
+ "type": "video",
152
+ "video": "file:///path/to/video1.mp4",
153
+ "max_pixels": 360 * 420,
154
+ "fps": 1.0,
155
+ },
156
+ {"type": "text", "text": "Describe this video."},
157
+ ],
158
+ }
159
+ ]
160
+
161
+ # Messages containing a video url and a text query
162
+ messages = [
163
+ {
164
+ "role": "user",
165
+ "content": [
166
+ {
167
+ "type": "video",
168
+ "video": "http://s2-11508.kwimgs.com/kos/nlav11508/MLLM/videos_caption/98312843263.mp4",
169
+ },
170
+ {"type": "text", "text": "Describe this video."},
171
+ ],
172
+ }
173
+ ]
174
+
175
+ # In Keye-VL, frame rate information is also fed into the model to align with absolute time.
176
+ # Preparation for inference
177
+ text = processor.apply_chat_template(
178
+ messages, tokenize=False, add_generation_prompt=True
179
+ )
180
+ image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
181
+ inputs = processor(
182
+ text=[text],
183
+ images=image_inputs,
184
+ videos=video_inputs,
185
+ padding=True,
186
+ return_tensors="pt",
187
+ **video_kwargs,
188
+ )
189
+ inputs = inputs.to("cuda")
190
+
191
+ # Inference
192
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
193
+ generated_ids_trimmed = [
194
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
195
+ ]
196
+ output_text = processor.batch_decode(
197
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
198
+ )
199
+ print(output_text)
200
+ ```
201
+
202
+ Video URL compatibility largely depends on the third-party library version. The details are in the table below. Change the backend with `FORCE_KEYEVL_VIDEO_READER=torchvision` or `FORCE_KEYEVL_VIDEO_READER=decord` if you prefer not to use the default one (see the sketch after the table).
203
+
204
+ | Backend | HTTP | HTTPS |
205
+ |-------------|------|-------|
206
+ | torchvision >= 0.19.0 | ✅ | ✅ |
207
+ | torchvision < 0.19.0 | ❌ | ❌ |
208
+ | decord | ✅ | ❌ |
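+
+ If you prefer to set the backend from inside your script, the following is a minimal sketch (it assumes the environment variable is read when `keye_vl_utils` loads videos, so set it before importing the package):
+
+ ```python
+ import os
+
+ # Select the video-decoding backend before importing keye_vl_utils: "decord" or "torchvision".
+ os.environ["FORCE_KEYEVL_VIDEO_READER"] = "torchvision"
+
+ from keye_vl_utils import process_vision_info
+ ```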
209
+ </details>
210
+
211
+ <details>
212
+ <summary>Batch inference</summary>
213
+
214
+ ```python
215
+ # Sample messages for batch inference
216
+ messages1 = [
217
+ {
218
+ "role": "user",
219
+ "content": [
220
+ {"type": "image", "image": "file:///path/to/image1.jpg"},
221
+ {"type": "image", "image": "file:///path/to/image2.jpg"},
222
+ {"type": "text", "text": "What are the common elements in these pictures?"},
223
+ ],
224
+ }
225
+ ]
226
+ messages2 = [
227
+ {"role": "system", "content": "You are a helpful assistant."},
228
+ {"role": "user", "content": "Who are you?"},
229
+ ]
230
+ # Combine messages for batch processing
231
+ messages = [messages1, messages2]
232
+
233
+ # Preparation for batch inference
234
+ texts = [
235
+ processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
236
+ for msg in messages
237
+ ]
238
+ image_inputs, video_inputs = process_vision_info(messages)
239
+ inputs = processor(
240
+ text=texts,
241
+ images=image_inputs,
242
+ videos=video_inputs,
243
+ padding=True,
244
+ return_tensors="pt",
245
+ )
246
+ inputs = inputs.to("cuda")
247
+
248
+ # Batch Inference
249
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
250
+ generated_ids_trimmed = [
251
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
252
+ ]
253
+ output_texts = processor.batch_decode(
254
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
255
+ )
256
+ print(output_texts)
257
+ ```
258
+ </details>
259
+
260
+ ### More Usage Tips
261
+
262
+ For input images, we support local files, base64, and URLs. For videos, we currently only support local files.
263
+
264
+ ```python
265
+ # You can directly insert a local file path, a URL, or a base64-encoded image at the position you want in the text.
266
+ ## Local file path
267
+ messages = [
268
+ {
269
+ "role": "user",
270
+ "content": [
271
+ {"type": "image", "image": "file:///path/to/your/image.jpg"},
272
+ {"type": "text", "text": "Describe this image."},
273
+ ],
274
+ }
275
+ ]
276
+ ## Image URL
277
+ messages = [
278
+ {
279
+ "role": "user",
280
+ "content": [
281
+ {"type": "image", "image": "http://path/to/your/image.jpg"},
282
+ {"type": "text", "text": "Describe this image."},
283
+ ],
284
+ }
285
+ ]
286
+ ## Base64 encoded image
287
+ messages = [
288
+ {
289
+ "role": "user",
290
+ "content": [
291
+ {"type": "image", "image": "data:image;base64,/9j/..."},
292
+ {"type": "text", "text": "Describe this image."},
293
+ ],
294
+ }
295
+ ]
296
+ ```
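+
+ If you need to construct the base64 form yourself, here is a minimal sketch that produces the `data:image;base64,...` string shown above (the `to_base64_image` helper is our own illustration, not part of the toolkit):
+
+ ```python
+ import base64
+
+ # Encode a local image file into the data-URI form accepted by the "image" field.
+ def to_base64_image(path: str) -> str:
+     with open(path, "rb") as f:
+         return "data:image;base64," + base64.b64encode(f.read()).decode("utf-8")
+
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": to_base64_image("/path/to/your/image.jpg")},
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }
+ ]
+ ```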
297
+
298
+ #### Image Resolution for performance boost
299
+
300
+ The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs, such as a token count range of 256-1280, to balance speed and memory usage.
301
+
302
+ ```python
303
+ min_pixels = 256 * 28 * 28
304
+ max_pixels = 1280 * 28 * 28
305
+ processor = AutoProcessor.from_pretrained(
306
+ "Kwai-Keye/Keye-VL-8B-Preview", min_pixels=min_pixels, max_pixels=max_pixels
307
+ )
308
+ ```
309
+
310
+ Besides, we provide two methods for fine-grained control over the image size input to the model:
311
+
312
+ 1. Define min_pixels and max_pixels: Images will be resized to maintain their aspect ratio within the range of min_pixels and max_pixels.
313
+
314
+ 2. Specify exact dimensions: Directly set `resized_height` and `resized_width`. These values will be rounded to the nearest multiple of 28.
315
+
316
+ ```python
317
+ # resized_height and resized_width
318
+ messages = [
319
+ {
320
+ "role": "user",
321
+ "content": [
322
+ {
323
+ "type": "image",
324
+ "image": "file:///path/to/your/image.jpg",
325
+ "resized_height": 280,
326
+ "resized_width": 420,
327
+ },
328
+ {"type": "text", "text": "Describe this image."},
329
+ ],
330
+ }
331
+ ]
332
+ # min_pixels and max_pixels
333
+ messages = [
334
+ {
335
+ "role": "user",
336
+ "content": [
337
+ {
338
+ "type": "image",
339
+ "image": "file:///path/to/your/image.jpg",
340
+ "min_pixels": 50176,
341
+ "max_pixels": 50176,
342
+ },
343
+ {"type": "text", "text": "Describe this image."},
344
+ ],
345
+ }
346
+ ]
347
+ ```
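+
+ As a rough rule of thumb, with 14x14 patches and a 2x2 spatial merge (see the architecture description below), each 28x28 block of pixels corresponds to roughly one visual token after merging. The helper below is only an estimate based on that assumption:
+
+ ```python
+ # Approximate visual-token count for a resized image, assuming one token per 28x28 pixel block.
+ def approx_visual_tokens(height: int, width: int, patch_size: int = 14, merge_size: int = 2) -> int:
+     factor = patch_size * merge_size  # 28
+     return (height // factor) * (width // factor)
+
+ # The 280x420 example above yields roughly 10 * 15 = 150 visual tokens,
+ # while max_pixels = 1280 * 28 * 28 caps an image at roughly 1280 tokens.
+ print(approx_visual_tokens(280, 420))
+ ```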
348
+
349
+ ## 👀 Architecture and Training Strategy
350
+
351
+ <div align="center">
352
+ <img src="asset/architecture.png" width="100%" alt="Kwai Keye Architecture">
353
+ <i> The Kwai Keye-VL model architecture is based on the Qwen3-8B language model and incorporates a vision encoder initialized from the open-source SigLIP. It supports native dynamic resolution, preserving the original aspect ratio of images by dividing each into a sequence of 14x14 patches. A simple MLP layer then maps and merges the visual tokens. The model uses 3D RoPE for unified processing of text, image, and video information, establishing a one-to-one correspondence between position encoding and absolute time to ensure precise perception of temporal changes in video content.</i>
354
+ </div>
355
+
356
+ ### 🌟 Pre-Train
357
+
358
+ <div align="center">
359
+ <img src="asset/pre-train.png" width="100%" alt="Kwai Keye Pretraining">
360
+ <i>The Kwai Keye pre-training pipeline, featuring a four-stage progressive strategy: Image-Text Matching, ViT-LLM Alignment, Multi-task Pre-training, and Annealing with model merging.</i>
361
+ </div>
362
+ <details>
363
+ <summary>More Details</summary>
364
+
365
+ #### Pre-training Data: Massive, High-Quality, Diverse
366
+
367
+ - **Diversity**: Includes image-text pairs, videos, pure text, etc., with tasks such as fine-grained description, OCR, Q&A, localization, and more.
368
+ - **High Quality**: Data is filtered using CLIP scores and VLM discriminators, and MinHASH is used for deduplication to prevent data leakage.
369
+ - **Self-Built Datasets**: High-quality internal datasets are specifically constructed, especially for detailed captions and Chinese OCR, to compensate for the shortcomings of open-source data.
370
+
371
+ #### Training Process: Four-Stage Progressive Optimization
372
+ Kwai Keye-VL adopts a four-stage progressive training strategy:
373
+
374
+ - **Stage 0 (Visual Pre-training)**: Continuously pre-trains the visual encoder to adapt to internal data distribution and support dynamic resolution.
375
+ - **Stage 1 (Cross-Modal Alignment)**: Freezes the backbone model and trains only the MLP to establish robust image-text alignment at low cost.
376
+ - **Stage 2 (Multi-Task Pre-training)**: Unlocks all parameters to comprehensively enhance the model's visual understanding capabilities.
377
+ - **Stage 3 (Annealing Training)**: Fine-tunes with high-quality data to further improve the model's fine-grained understanding capabilities.
378
+
379
+ Finally, Kwai Keye-VL explores isomorphic heterogeneous fusion technology by averaging parameters of annealed training models with different data ratios, reducing model bias while retaining multidimensional capabilities, thereby enhancing the model's robustness.
380
+
381
+ </details>
382
+
383
+ ### 🌟 Post-Train
384
+
385
+ The post-training of Kwai Keye is meticulously organized into two phases spanning five stages, aiming to comprehensively enhance the model's performance, especially its reasoning ability in complex tasks. This is a key breakthrough for achieving advanced cognitive functions.
386
+
387
+ #### Stage I. No-Reasoning Training: Strengthening Basic Performance
388
+
389
+ <div align="center">
390
+ <img src="asset/post1.jpeg" width="100%" alt="Kwai Keye Post-Training">
391
+ <i>This phase focuses on the model's basic performance and stability in non-reasoning scenarios.</i>
392
+ </div>
393
+
394
+ <details>
395
+ <summary>More Details</summary>
396
+
397
+ - **Stage I.1: Supervised Fine-Tuning (SFT)**
398
+ - Data Composition: Includes 5 million multimodal samples, built on a diverse task taxonomy (70,000 tasks) from the self-developed TaskGalaxy framework. High-difficulty data is selected by multimodal large models and manually annotated to ensure data quality and challenge.
399
+
400
+ - **Stage I.2: Mixed Preference Optimization (MPO)**
401
+ - Data Composition: Comprises open-source data and pure-text preference data. Bad cases from the SFT model are used as prompts, and preference data is generated through rejection sampling with Qwen2.5-VL 72B and the SFT models, followed by manual scoring and ranking.
402
+
403
+ </details>
404
+
405
+ #### Stage II. Reasoning Training: Core Breakthrough for Complex Cognition
406
+
407
+ <div align="center">
408
+ <img src="asset/post2.jpeg" width="100%" alt="Kwai Keye Post-Training">
409
+ <br>
410
+ <i>This phase is the highlight and major contribution of the Kwai Keye training process. By introducing a mix-mode Chain of Thought (CoT) and multi-thinking mode reinforcement learning (RL) mechanisms, it significantly enhances the model's multimodal perception, reasoning, and think-with-image capabilities, enabling it to handle more complex, multi-step tasks.</i>
411
+ </div>
412
+
413
+ <details>
414
+ <summary>More Details</summary>
415
+
416
+ - **Step II.1: CoT Cold-Start**
417
+ - Objective: Cold-start the model's chain of thought reasoning ability, allowing it to mimic human step-by-step thinking.
418
+ - Data Composition: Combines non-reasoning data (330,000), reasoning data (230,000), auto-reasoning data (20,000), and agentic reasoning data (100,000) to teach the model different modes.
419
+ - Thinking Data: Focuses on high-difficulty perception and reasoning scenarios such as math, science, charts, complex Chinese, and OCR, using multimodal large models for repeated sampling and evaluation to build over 70,000 complex chain-of-thought samples.
420
+ - Pure Text Data: Constructs a pure text long thought chain dataset from dimensions like code, math, science, instruction following, and general reasoning tasks.
421
+ - Auto-Think Data: Automatically selects "think" or "no_think" modes based on the complexity of prompts, enabling adaptive reasoning mode switching.
422
+ - Think with Image Data: 100,000 agentic data entries. Qwen2.5-VL 72B is asked whether image operations (e.g., cropping, rotating, enhancing contrast) would simplify the problem or improve answer quality; combined with external sandboxed code execution, this empowers the model to solve problems by writing code that manipulates images or performs mathematical calculations.
423
+ - Training Strategy: Trains with a mix of four modes to achieve cold-start in different reasoning modes.
424
+ - **Step II.2: CoT-Mix RL**
425
+ - Objective: Deeply optimize the model's comprehensive abilities in multimodal perception, reasoning, pure text math, short video understanding, and agentic tasks through reinforcement learning based on the chain of thought, making the reasoning process more robust and efficient.
426
+ - Data Composition: Covers complex tasks from multimodal perception (complex text recognition, object counting), multimodal reasoning, high-difficulty math problems, short video content understanding to Think with Image.
427
+ - Training Strategy: Uses a mix-mode GRPO algorithm for reinforcement learning, where reward signals evaluate both the correctness of results and the consistency of the process and results, ensuring synchronized optimization of reasoning processes and final outcomes.
428
+ - **Step II.3: Iterative Alignment**
429
+ - Objective: Address common issues like repetitive crashes and poor logic in model-generated content, and enable spontaneous reasoning mode selection to enhance final performance and stability.
430
+ - Data Composition: Constructs preference data through Rejection Fine-Tuning (RFT), combining rule-based scoring (judging repetition, instruction following, etc.) and model scoring (cognitive scores provided by large models) to rank various model responses, building a high-quality preference dataset.
431
+ - Training Strategy: Multi-round iterative optimization with the constructed "good/bad" preference data pairs through the MPO algorithm. This aims to correct model generation flaws and ultimately enable it to intelligently and adaptively choose whether to activate deep reasoning modes based on problem complexity.
432
+
433
+ </details>
434
+
435
+ ## 📈 Experimental Results
436
+
437
+ ![image](https://github.com/user-attachments/assets/a27cc0b8-e511-4879-969a-b6bc90f61c7e)
438
+
439
+ 1. Keye-VL-8B establishes itself with powerful, state-of-the-art perceptual abilities that are competitive with leading models.
440
+ 2. Keye-VL-8B demonstrates exceptional proficiency in video understanding. Across a comprehensive suite of authoritative public video benchmarks, including Video-MME, Video-MMMU, TempCompass, LongVideoBench, and MMVU, the model's performance significantly surpasses that of other top-tier models of a comparable size.
441
+ 3. In evaluation sets that require complex logical reasoning and mathematical problem-solving, such as WeMath, MathVerse, and LogicVista, Kwai Keye-VL-8B displays a strong performance curve. This highlights its advanced capacity for logical deduction and solving complex quantitative problems.
442
+
443
+ ## Requirements
444
+ The code of Kwai Keye-VL has been included in the latest Hugging Face Transformers, and we advise you to build from source with the following command:
445
+ ```
446
+ pip install git+https://github.com/huggingface/transformers accelerate
447
+ ```
448
+ Otherwise, you might encounter the following error:
449
+ ```
450
+ KeyError: 'Keye-VL'
451
+ ```
452
+
453
+ ## ✒️ Citation
454
+
455
+ If you find our work helpful for your research, please consider citing our work.
456
+
457
+ ```bibtex
458
+ @misc{Keye-VL-8B-Preview,
459
+ title = {Keye-VL-8B-Preview},
460
+ url = {https://github.com/Kwai-Keye/Keye},
461
+ author = {Keye Team},
462
+ month = {June},
463
+ year = {2025}
464
+ }
465
+ ```
466
+
467
+ ## Acknowledgement
468
+
469
+ Kwai Keye-VL is developed based on the codebases of the following projects: [SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384), [Qwen3](https://github.com/QwenLM/Qwen3), [Qwen2.5-VL](https://github.com/QwenLM/Qwen2.5-VL), [VLMEvalKit](https://github.com/open-compass/VLMEvalKit). We sincerely thank these projects for their outstanding work.
added_tokens.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|clip_time_end|>": 151674,
11
+ "<|clip_time_start|>": 151673,
12
+ "<|endoftext|>": 151643,
13
+ "<|file_sep|>": 151664,
14
+ "<|fim_middle|>": 151660,
15
+ "<|fim_pad|>": 151662,
16
+ "<|fim_prefix|>": 151659,
17
+ "<|fim_suffix|>": 151661,
18
+ "<|im_end|>": 151645,
19
+ "<|im_start|>": 151644,
20
+ "<|image_pad|>": 151655,
21
+ "<|object_ref_end|>": 151647,
22
+ "<|object_ref_start|>": 151646,
23
+ "<|ocr_text_end|>": 151672,
24
+ "<|ocr_text_start|>": 151671,
25
+ "<|point_end|>": 151670,
26
+ "<|point_start|>": 151669,
27
+ "<|quad_end|>": 151651,
28
+ "<|quad_start|>": 151650,
29
+ "<|repo_name|>": 151663,
30
+ "<|video_pad|>": 151656,
31
+ "<|vision_end|>": 151653,
32
+ "<|vision_pad|>": 151654,
33
+ "<|vision_start|>": 151652
34
+ }
asset/architecture.png ADDED

Git LFS Details

  • SHA256: d691a2b0ce5818fb2583bf990e2c83c2c5e4bce33b5146e9da6f35b73a564781
  • Pointer size: 132 Bytes
  • Size of remote file: 1.19 MB
asset/keye_logo_2.png ADDED

Git LFS Details

  • SHA256: 550ccafbb3f2dc4a7223d836060fddd747cae3a4e4c36b94b332e4553813fec5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.66 MB
asset/post1.jpeg ADDED
asset/post2.jpeg ADDED

Git LFS Details

  • SHA256: 078d089db23ebba73e7f11628102032da14ac723c1457725cdb4642f16c8376a
  • Pointer size: 131 Bytes
  • Size of remote file: 136 kB
asset/pre-train.png ADDED

Git LFS Details

  • SHA256: 7f920c5545c11d2284adb0fecc91e2f3cb07869145f4c65a71fc1731982b1d4e
  • Pointer size: 131 Bytes
  • Size of remote file: 346 kB
asset/teaser.png ADDED

Git LFS Details

  • SHA256: 5692e80bc705058d5b5a56c5031e42567af668129416f1aa9976d09ad2b0e1ee
  • Pointer size: 132 Bytes
  • Size of remote file: 4.54 MB
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
config.json ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "auto_map": {
4
+ "AutoConfig": "configuration_keye.KeyeConfig",
5
+ "AutoModel": "modeling_keye.KeyeForConditionalGeneration",
6
+ "AutoModelForCausalLM": "modeling_keye.KeyeForConditionalGeneration"
7
+ },
8
+ "architectures": [
9
+ "KeyeForConditionalGeneration"
10
+ ],
11
+ "attention_bias": false,
12
+ "attention_dropout": 0.0,
13
+ "bos_token_id": 151643,
14
+ "eos_token_id": 151645,
15
+ "vision_start_token_id": 151652,
16
+ "vision_end_token_id": 151653,
17
+ "vision_token_id": 151654,
18
+ "image_token_id": 151655,
19
+ "video_token_id": 151656,
20
+ "head_dim": 128,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 4096,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 12288,
25
+ "max_position_embeddings": 40960,
26
+ "max_window_layers": 36,
27
+ "model_type": "Keye",
28
+ "num_attention_heads": 32,
29
+ "num_hidden_layers": 36,
30
+ "num_key_value_heads": 8,
31
+ "rms_norm_eps": 1e-06,
32
+ "rope_scaling": null,
33
+ "rope_theta": 1000000,
34
+ "sliding_window": null,
35
+ "tie_word_embeddings": false,
36
+ "torch_dtype": "bfloat16",
37
+ "transformers_version": "4.41.2",
38
+ "use_cache": true,
39
+ "use_sliding_window": false,
40
+ "initializer_factor": 1.0,
41
+ "vision_config": {
42
+ "_attn_implementation_autoset": true,
43
+ "add_cross_attention": false,
44
+ "architectures": [
45
+ "SiglipVisionModel"
46
+ ],
47
+ "attention_dropout": 0.0,
48
+ "auto_map": {
49
+ "AutoConfig": "configuration_keye.KeyeVisionConfig",
50
+ "AutoModel": "modeling_keye.SiglipVisionModel"
51
+ },
52
+ "hidden_size": 1152,
53
+ "image_size": 384,
54
+ "intermediate_size": 4304,
55
+ "model_type": "siglip_vision_model",
56
+ "num_attention_heads": 16,
57
+ "num_hidden_layers": 27,
58
+ "patch_size": 14,
59
+ "spatial_merge_size": 2,
60
+ "tokens_per_second": 2,
61
+ "temporal_patch_size": 2
62
+ },
63
+ "rope_scaling": {
64
+ "type": "mrope",
65
+ "mrope_section": [
66
+ 16,
67
+ 24,
68
+ 24
69
+ ]
70
+ },
71
+ "vocab_size": 151936
72
+ }
configuration_keye.py ADDED
@@ -0,0 +1,243 @@
1
+ # coding=utf-8
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from transformers.configuration_utils import PretrainedConfig
15
+ from transformers.modeling_rope_utils import rope_config_validation
16
+
17
+
18
+ class KeyeVisionConfig(PretrainedConfig):
19
+ model_type = "Keye"
20
+ base_config_key = "vision_config"
21
+
22
+ def __init__(
23
+ self,
24
+ hidden_size=768,
25
+ intermediate_size=3072,
26
+ num_hidden_layers=12,
27
+ num_attention_heads=12,
28
+ num_channels=3,
29
+ image_size=224,
30
+ patch_size=14,
31
+ hidden_act="gelu_pytorch_tanh",
32
+ layer_norm_eps=1e-6,
33
+ attention_dropout=0.0,
34
+ spatial_merge_size=2,
35
+ temporal_patch_size=2,
36
+ tokens_per_second=2,
37
+ **kwargs,
38
+ ):
39
+ super().__init__(**kwargs)
40
+
41
+ self.hidden_size = hidden_size
42
+ self.intermediate_size = intermediate_size
43
+ self.num_hidden_layers = num_hidden_layers
44
+ self.num_attention_heads = num_attention_heads
45
+ self.num_channels = num_channels
46
+ self.patch_size = patch_size
47
+ self.image_size = image_size
48
+ self.attention_dropout = attention_dropout
49
+ self.layer_norm_eps = layer_norm_eps
50
+ self.hidden_act = hidden_act
51
+ self.spatial_merge_size = spatial_merge_size
52
+ self.temporal_patch_size = temporal_patch_size
53
+ self.tokens_per_second = tokens_per_second
54
+
55
+
56
+ class KeyeConfig(PretrainedConfig):
57
+ r"""
58
+ This is the configuration class to store the configuration of a [`KeyeForConditionalGeneration`].
59
+
60
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
61
+ documentation from [`PretrainedConfig`] for more information.
62
+
63
+
64
+ Args:
65
+ vocab_size (`int`, *optional*, defaults to 152064):
66
+ Vocabulary size of the Keye model. Defines the number of different tokens that can be represented by the
67
+ `inputs_ids` passed when calling [`KeyeForConditionalGeneration`]
68
+ hidden_size (`int`, *optional*, defaults to 8192):
69
+ Dimension of the hidden representations.
70
+ intermediate_size (`int`, *optional*, defaults to 29568):
71
+ Dimension of the MLP representations.
72
+ num_hidden_layers (`int`, *optional*, defaults to 80):
73
+ Number of hidden layers in the Transformer encoder.
74
+ num_attention_heads (`int`, *optional*, defaults to 64):
75
+ Number of attention heads for each attention layer in the Transformer encoder.
76
+ num_key_value_heads (`int`, *optional*, defaults to 8):
77
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
78
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
79
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
80
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
81
+ by meanpooling all the original heads within that group. For more details checkout [this
82
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
83
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
84
+ The non-linear activation function (function or string) in the decoder.
85
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
86
+ The maximum sequence length that this model might ever be used with.
87
+ initializer_range (`float`, *optional*, defaults to 0.02):
88
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
89
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
90
+ The epsilon used by the rms normalization layers.
91
+ use_cache (`bool`, *optional*, defaults to `True`):
92
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
93
+ relevant if `config.is_decoder=True`.
94
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
95
+ Whether the model's input and output word embeddings should be tied.
96
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
97
+ The base period of the RoPE embeddings.
98
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
99
+ Whether to use sliding window attention.
100
+ sliding_window (`int`, *optional*, defaults to 4096):
101
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
102
+ max_window_layers (`int`, *optional*, defaults to 80):
103
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
104
+ attention_dropout (`float`, *optional*, defaults to 0.0):
105
+ The dropout ratio for the attention probabilities.
106
+ vision_config (`Dict`, *optional*):
107
+ The config for the visual encoder initialization.
108
+ rope_scaling (`Dict`, *optional*):
109
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
110
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
111
+ accordingly.
112
+ Expected contents:
113
+ `rope_type` (`str`):
114
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
115
+ 'llama3'], with 'default' being the original RoPE implementation.
116
+ `factor` (`float`, *optional*):
117
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
118
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
119
+ original maximum pre-trained length.
120
+ `original_max_position_embeddings` (`int`, *optional*):
121
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
122
+ pretraining.
123
+ `attention_factor` (`float`, *optional*):
124
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
125
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
126
+ `factor` field to infer the suggested value.
127
+ `beta_fast` (`float`, *optional*):
128
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
129
+ ramp function. If unspecified, it defaults to 32.
130
+ `beta_slow` (`float`, *optional*):
131
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
132
+ ramp function. If unspecified, it defaults to 1.
133
+ `short_factor` (`List[float]`, *optional*):
134
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
135
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
136
+ size divided by the number of attention heads divided by 2
137
+ `long_factor` (`List[float]`, *optional*):
138
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
139
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
140
+ size divided by the number of attention heads divided by 2
141
+ `low_freq_factor` (`float`, *optional*):
142
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
143
+ `high_freq_factor` (`float`, *optional*):
144
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
145
+
146
+ ```python
147
+ >>> from transformers import KeyeForConditionalGeneration, KeyeConfig
148
+
149
+ >>> # Initializing a Keye style configuration
150
+ >>> configuration = KeyeConfig()
151
+
152
+ >>> # Initializing a model from the Keye style configuration
153
+ >>> model = KeyeForConditionalGeneration(configuration)
154
+
155
+ >>> # Accessing the model configuration
156
+ >>> configuration = model.config
157
+ ```"""
158
+
159
+ model_type = "Keye"
160
+ sub_configs = {"vision_config": KeyeVisionConfig}
161
+ keys_to_ignore_at_inference = ["past_key_values"]
162
+ # Default tensor parallel plan for base model `Keye`
163
+ base_model_tp_plan = {
164
+ "layers.*.self_attn.q_proj": "colwise",
165
+ "layers.*.self_attn.k_proj": "colwise",
166
+ "layers.*.self_attn.v_proj": "colwise",
167
+ "layers.*.self_attn.o_proj": "rowwise",
168
+ "layers.*.mlp.gate_proj": "colwise",
169
+ "layers.*.mlp.up_proj": "colwise",
170
+ "layers.*.mlp.down_proj": "rowwise",
171
+ }
172
+ base_model_pp_plan = {
173
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
174
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
175
+ "norm": (["hidden_states"], ["hidden_states"]),
176
+ }
177
+
178
+ def __init__(
179
+ self,
180
+ vocab_size=152064,
181
+ hidden_size=8192,
182
+ intermediate_size=29568,
183
+ num_hidden_layers=80,
184
+ num_attention_heads=64,
185
+ num_key_value_heads=8,
186
+ hidden_act="silu",
187
+ max_position_embeddings=32768,
188
+ initializer_range=0.02,
189
+ rms_norm_eps=1e-05,
190
+ use_cache=True,
191
+ tie_word_embeddings=False,
192
+ rope_theta=1000000.0,
193
+ use_sliding_window=False,
194
+ sliding_window=4096,
195
+ max_window_layers=80,
196
+ attention_dropout=0.0,
197
+ vision_config=None,
198
+ rope_scaling=None,
199
+ **kwargs,
200
+ ):
201
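+ # Accept either a dict (e.g. parsed from config.json) or nothing, in which case a default KeyeVisionConfig is used.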
+ if isinstance(vision_config, dict):
202
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
203
+ elif vision_config is None:
204
+ self.vision_config = self.sub_configs["vision_config"]()
205
+
206
+ self.vocab_size = vocab_size
207
+ self.max_position_embeddings = max_position_embeddings
208
+ self.hidden_size = hidden_size
209
+ self.intermediate_size = intermediate_size
210
+ self.num_hidden_layers = num_hidden_layers
211
+ self.num_attention_heads = num_attention_heads
212
+ self.use_sliding_window = use_sliding_window
213
+ self.sliding_window = sliding_window
214
+ self.max_window_layers = max_window_layers
215
+
216
+ # for backward compatibility
217
+ if num_key_value_heads is None:
218
+ num_key_value_heads = num_attention_heads
219
+
220
+ self.num_key_value_heads = num_key_value_heads
221
+ self.hidden_act = hidden_act
222
+ self.initializer_range = initializer_range
223
+ self.rms_norm_eps = rms_norm_eps
224
+ self.use_cache = use_cache
225
+ self.rope_theta = rope_theta
226
+ self.attention_dropout = attention_dropout
227
+ self.rope_scaling = rope_scaling
228
+
229
+ # Validate the correctness of rotary position embeddings parameters
230
+ # BC: if there is a 'type' field, move it to 'rope_type'.
231
+ # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
232
+ # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
233
+ # TODO: @raushan update config in the hub
234
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
235
+ if self.rope_scaling["type"] == "mrope":
236
+ self.rope_scaling["type"] = "default"
237
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
238
+ rope_config_validation(self, ignore_keys={"mrope_section"})
239
+
240
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
241
+
242
+
243
+ __all__ = ["KeyeConfig"]
generation_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.51.0"
13
+ }
image_processing_keye.py ADDED
@@ -0,0 +1,570 @@
1
+ # coding=utf-8
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Image processor class for Keye."""
15
+
16
+ import math
17
+ from typing import Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
22
+ from torchvision.transforms import functional as TF
23
+ from transformers.image_transforms import (
24
+ convert_to_rgb,
25
+ resize,
26
+ to_channel_dimension_format,
27
+ )
28
+ from transformers.image_utils import (
29
+ OPENAI_CLIP_MEAN,
30
+ OPENAI_CLIP_STD,
31
+ ChannelDimension,
32
+ PILImageResampling,
33
+ get_image_size,
34
+ infer_channel_dimension_format,
35
+ is_scaled_image,
36
+ is_valid_image,
37
+ make_list_of_images,
38
+ to_numpy_array,
39
+ valid_images,
40
+ validate_preprocess_arguments,
41
+ )
42
+ from transformers.utils import TensorType, is_vision_available, logging
43
+
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+
48
+ if is_vision_available():
49
+ from PIL import Image
50
+
51
+ ImageInput = Union[
52
+ "PIL.Image.Image",
53
+ np.ndarray,
54
+ "torch.Tensor",
55
+ List["PIL.Image.Image"],
56
+ List[np.ndarray],
57
+ List["torch.Tensor"],
58
+ ] # noqa
59
+
60
+
61
+ VideoInput = Union[
62
+ List["PIL.Image.Image"],
63
+ "np.ndarray",
64
+ "torch.Tensor",
65
+ List["np.ndarray"],
66
+ List["torch.Tensor"],
67
+ List[List["PIL.Image.Image"]],
68
+ List[List["np.ndarrray"]],
69
+ List[List["torch.Tensor"]],
70
+ ] # noqa
71
+
72
+
73
+ def make_batched_images(images) -> List[List[ImageInput]]:
74
+ """
75
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
76
+
77
+ Args:
78
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
79
+ The input image.
80
+
81
+ Returns:
82
+ list: A list of images.
83
+ """
84
+ if (
85
+ isinstance(images, (list, tuple))
86
+ and isinstance(images[0], (list, tuple))
87
+ and is_valid_image(images[0][0])
88
+ ):
89
+ return [img for img_list in images for img in img_list]
90
+
91
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
92
+ return images
93
+
94
+ elif is_valid_image(images):
95
+ return [images]
96
+
97
+ raise ValueError(f"Could not make batched images from {images}")
98
+
99
+
100
+ def adjust_size(size, patch_size):
101
+ num_patches = size // patch_size
102
+ if num_patches % 2 != 0:  # if the patch count is odd, subtract 1
103
+ num_patches -= 1
104
+ return num_patches * patch_size
105
+
106
+
107
+ def make_batched_videos(videos) -> List[VideoInput]:
108
+ if (
109
+ isinstance(videos, (list, tuple))
110
+ and isinstance(videos[0], (list, tuple))
111
+ and is_valid_image(videos[0][0])
112
+ ):
113
+ return videos
114
+
115
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
116
+ if isinstance(videos[0], Image.Image):
117
+ return [videos]
118
+ elif len(videos[0].shape) == 4:
119
+ return [list(video) for video in videos]
120
+
121
+ elif is_valid_image(videos) and len(videos.shape) == 4:
122
+ return [list(videos)]
123
+
124
+ raise ValueError(f"Could not make batched video from {videos}")
125
+
126
+
127
+ def smart_resize(
128
+ height: int,
129
+ width: int,
130
+ factor: int = 28,
131
+ min_pixels: int = 56 * 56,
132
+ max_pixels: int = 14 * 14 * 4096,
133
+ ):
134
+ """Rescales the image so that the following conditions are met:
135
+
136
+ 1. Both dimensions (height and width) are divisible by 'factor'.
137
+
138
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
139
+
140
+ 3. The aspect ratio of the image is maintained as closely as possible.
141
+
142
+ """
143
+ # if height < factor or width < factor:
144
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
145
+ # if int(height < factor//4) + int(width < factor//4):
146
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
147
+
148
+ if height < factor:
149
+ print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
150
+ width = round((width * factor) / height)
151
+ height = factor
152
+
153
+ if width < factor:
154
+ print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
155
+ height = round((height * factor) / width)
156
+ width = factor
157
+
158
+ if max(height, width) / min(height, width) > 200:
159
+ raise ValueError(
160
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
161
+ )
162
+ h_bar = round(height / factor) * factor
163
+ w_bar = round(width / factor) * factor
164
+ if h_bar * w_bar > max_pixels:
165
+ beta = math.sqrt((height * width) / max_pixels)
166
+ h_bar = math.floor(height / beta / factor) * factor
167
+ w_bar = math.floor(width / beta / factor) * factor
168
+ elif h_bar * w_bar < min_pixels:
169
+ beta = math.sqrt(min_pixels / (height * width))
170
+ h_bar = math.ceil(height * beta / factor) * factor
171
+ w_bar = math.ceil(width * beta / factor) * factor
172
+ return h_bar, w_bar
173
+
174
+
175
+ class SiglipImageProcessor(BaseImageProcessor):
176
+ r"""
177
+ Constructs a Siglip image processor that dynamically resizes images based on the original images.
178
+
179
+ Args:
180
+ do_resize (`bool`, *optional*, defaults to `True`):
181
+ Whether to resize the image's (height, width) dimensions.
182
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
183
+ Resampling filter to use when resizing the image.
184
+ do_rescale (`bool`, *optional*, defaults to `True`):
185
+ Whether to rescale the image by the specified scale `rescale_factor`.
186
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
187
+ Scale factor to use if rescaling the image.
188
+ do_normalize (`bool`, *optional*, defaults to `True`):
189
+ Whether to normalize the image.
190
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
191
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
192
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
193
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
194
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
195
+ Whether to convert the image to RGB.
196
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
197
+ The min pixels of the image to resize the image.
198
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
199
+ The max pixels of the image to resize the image.
200
+ patch_size (`int`, *optional*, defaults to 14):
201
+ The spatial patch size of the vision encoder.
202
+ temporal_patch_size (`int`, *optional*, defaults to 2):
203
+ The temporal patch size of the vision encoder.
204
+ merge_size (`int`, *optional*, defaults to 2):
205
+ The merge size of the vision encoder to llm encoder.
206
+ """
207
+
208
+ model_input_names = [
209
+ "pixel_values",
210
+ "image_grid_thw",
211
+ "pixel_values_videos",
212
+ "video_grid_thw",
213
+ ]
214
+
215
+ def __init__(
216
+ self,
217
+ do_resize: bool = True,
218
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
219
+ do_rescale: bool = True,
220
+ rescale_factor: Union[int, float] = 1 / 255,
221
+ do_normalize: bool = True,
222
+ image_mean: Optional[Union[float, List[float]]] = None,
223
+ image_std: Optional[Union[float, List[float]]] = None,
224
+ do_convert_rgb: bool = True,
225
+ min_pixels: int = 56 * 56,
226
+ max_pixels: int = 28 * 28 * 1280,
227
+ patch_size: int = 14,
228
+ temporal_patch_size: int = 1,
229
+ merge_size: int = 2,
230
+ **kwargs,
231
+ ) -> None:
232
+ super().__init__(**kwargs)
233
+ self.do_resize = do_resize
234
+ self.resample = resample
235
+ self.do_rescale = do_rescale
236
+ self.rescale_factor = rescale_factor
237
+ self.do_normalize = do_normalize
238
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
239
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
240
+ self.min_pixels = min_pixels
241
+ self.max_pixels = max_pixels
242
+ self.patch_size = patch_size
243
+ self.temporal_patch_size = temporal_patch_size
244
+ self.merge_size = merge_size
245
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used
246
+ self.do_convert_rgb = do_convert_rgb
247
+
248
+ def mvit_rescale(self, image: Image.Image, merge_size: int = 2) -> Image.Image:
249
+ try:
250
+ w, h = image.size
251
+ except:
252
+ raise ValueError(str((type(image), image)))
253
+ patch_size = self.patch_size
254
+
255
+ if (w // patch_size) * (h // patch_size) > self.in_token_limit:
256
+ scale = math.sqrt(
257
+ self.in_token_limit / ((w // patch_size) * (h // patch_size))
258
+ )
259
+ new_w, new_h = int(w * scale), int(h * scale)
260
+
261
+ image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
262
+ if self.pad_input:
263
+ new_w, new_h = image.size
264
+ pad_size_h = merge_size * patch_size
265
+ pad_size_w = merge_size * patch_size
266
+
267
+ pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
268
+ pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
269
+
270
+ image = TF.pad(image, (0, 0, pad_w, pad_h))
271
+ else:
272
+ new_w, new_h = image.size
273
+ new_w = new_w - new_w % patch_size
274
+ new_h = new_h - new_h % patch_size
275
+
276
+ new_w = adjust_size(new_w, patch_size)
277
+ new_h = adjust_size(new_h, patch_size)
278
+
279
+ image = TF.center_crop(image, (new_h, new_w))
280
+
281
+ w, h = image.size
282
+ if w // patch_size >= 512 or h // patch_size >= 512:
283
+ new_h = min(patch_size * 510, h)
284
+ new_w = min(patch_size * 510, w)
285
+ image = TF.center_crop(image, (new_h, new_w))
286
+ # raise ValueError("Exceed pos emb")
287
+ return image
288
+
289
+ def _preprocess(
290
+ self,
291
+ images: Union[ImageInput, VideoInput],
292
+ do_resize: bool = None,
293
+ resample: PILImageResampling = None,
294
+ do_rescale: bool = None,
295
+ rescale_factor: float = None,
296
+ do_normalize: bool = None,
297
+         image_mean: Optional[Union[float, List[float]]] = None,
+         image_std: Optional[Union[float, List[float]]] = None,
+         do_convert_rgb: bool = None,
+         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+     ):
+         """
+         Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+
+         Args:
+             images (`ImageInput`):
+                 Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
+             vision_info (`List[Dict]`, *optional*):
+                 Optional list of dictionaries containing additional information about vision inputs.
+             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                 Whether to resize the image.
+             resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                 Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+             do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                 Whether to rescale the image.
+             rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                 Scale factor to use if rescaling the image.
+             do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                 Whether to normalize the image.
+             image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                 Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                 Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+             do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                 Whether to convert the image to RGB.
+             data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                 The channel dimension format for the output image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - Unset: Use the channel dimension format of the input image.
+             input_data_format (`ChannelDimension` or `str`, *optional*):
+                 The channel dimension format for the input image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+         """
+         images = make_list_of_images(images)
+
+         if do_convert_rgb:
+             images = [convert_to_rgb(image) for image in images]
+
+         # All transformations expect numpy arrays.
+         images = [to_numpy_array(image) for image in images]
+
+         if is_scaled_image(images[0]) and do_rescale:
+             logger.warning_once(
+                 "It looks like you are trying to rescale already rescaled images. If the input"
+                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+             )
+         if input_data_format is None:
+             # We assume that all images have the same channel dimension format.
+             input_data_format = infer_channel_dimension_format(images[0])
+
+         height, width = get_image_size(images[0], channel_dim=input_data_format)
+         resized_height, resized_width = height, width
+         processed_images = []
+
+         for image in images:
+             # image = self.mvit_rescale(image, merge_size=self.merge_size)
+             if do_resize:
+                 resized_height, resized_width = smart_resize(
+                     height,
+                     width,
+                     factor=self.patch_size * self.merge_size,
+                     min_pixels=self.min_pixels,
+                     max_pixels=self.max_pixels,
+                 )
+                 image = resize(
+                     image,
+                     size=(resized_height, resized_width),
+                     resample=resample,
+                     input_data_format=input_data_format,
+                 )
+
+             if do_rescale:
+                 image = self.rescale(
+                     image, scale=rescale_factor, input_data_format=input_data_format
+                 )
+
+             if do_normalize:
+                 image = self.normalize(
+                     image=image,
+                     mean=image_mean,
+                     std=image_std,
+                     input_data_format=input_data_format,
+                 )
+
+             image = to_channel_dimension_format(
+                 image, data_format, input_channel_dim=input_data_format
+             )
+             processed_images.append(image)
+
+         patches = np.array(processed_images)
+         if data_format == ChannelDimension.LAST:
+             patches = patches.transpose(0, 3, 1, 2)
+         if patches.shape[0] == 1:
+             patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+         init_patches = patches
+         channel = patches.shape[1]
+         grid_t = patches.shape[0] // self.temporal_patch_size
+         grid_h, grid_w = (
+             resized_height // self.patch_size,
+             resized_width // self.patch_size,
+         )
+         patches = patches.reshape(
+             grid_t,
+             self.temporal_patch_size,
+             channel,
+             grid_h,
+             self.patch_size,
+             grid_w,
+             self.patch_size,
+         )
+         patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
+         assert self.temporal_patch_size == 1
+         flatten_patches = patches.reshape(
+             grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
+         )
+         return flatten_patches, (grid_t, grid_h, grid_w)
+
+     def preprocess(
+         self,
+         images: ImageInput,
+         videos: VideoInput = None,
+         do_resize: bool = None,
+         size: Dict[str, int] = None,
+         resample: PILImageResampling = None,
+         do_rescale: bool = None,
+         rescale_factor: float = None,
+         do_normalize: bool = None,
+         image_mean: Optional[Union[float, List[float]]] = None,
+         image_std: Optional[Union[float, List[float]]] = None,
+         do_convert_rgb: bool = None,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+     ):
+         """
+         Args:
+             images (`ImageInput`):
+                 Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                 passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+             videos (`VideoInput`):
+                 Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
+                 passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
+             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                 Whether to resize the image.
+             size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                 Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+                 the longest edge resized to keep the input aspect ratio.
+             resample (`int`, *optional*, defaults to `self.resample`):
+                 Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                 has an effect if `do_resize` is set to `True`.
+             do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                 Whether to rescale the image.
+             rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                 Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+             do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                 Whether to normalize the image.
+             image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                 Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                 Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                 `True`.
+             do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                 Whether to convert the image to RGB.
+             return_tensors (`str` or `TensorType`, *optional*):
+                 The type of tensors to return. Can be one of:
+                 - Unset: Return a list of `np.ndarray`.
+                 - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                 - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                 - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                 - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+             data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                 The channel dimension format for the output image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - Unset: Use the channel dimension format of the input image.
+             input_data_format (`ChannelDimension` or `str`, *optional*):
+                 The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                 from the input image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+         """
+         do_resize = do_resize if do_resize is not None else self.do_resize
+         size = size if size is not None else self.size
+         resample = resample if resample is not None else self.resample
+         do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+         rescale_factor = (
+             rescale_factor if rescale_factor is not None else self.rescale_factor
+         )
+         do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+         image_mean = image_mean if image_mean is not None else self.image_mean
+         image_std = image_std if image_std is not None else self.image_std
+         do_convert_rgb = (
+             do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+         )
+
+         if images is not None:
+             images = make_batched_images(images)
+         if videos is not None:
+             videos = make_batched_videos(videos)
+
+         if images is not None and not valid_images(images):
+             raise ValueError(
+                 "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                 "torch.Tensor, tf.Tensor or jax.ndarray."
+             )
+
+         validate_preprocess_arguments(
+             rescale_factor=rescale_factor,
+             do_normalize=do_normalize,
+             image_mean=image_mean,
+             image_std=image_std,
+             do_resize=do_resize,
+             size=size,
+             resample=resample,
+         )
+
+         if images is not None:
+             pixel_values, vision_grid_thws = [], []
+             for image in images:
+                 patches, image_grid_thw = self._preprocess(
+                     image,
+                     do_resize=do_resize,
+                     resample=resample,
+                     do_rescale=do_rescale,
+                     rescale_factor=rescale_factor,
+                     do_normalize=do_normalize,
+                     image_mean=image_mean,
+                     image_std=image_std,
+                     data_format=data_format,
+                     do_convert_rgb=do_convert_rgb,
+                     input_data_format=input_data_format,
+                 )
+                 pixel_values.extend(patches)
+                 vision_grid_thws.append(image_grid_thw)
+             pixel_values = np.array(pixel_values)
+             vision_grid_thws = np.array(vision_grid_thws)
+             data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
+
+         if videos is not None:
+             pixel_values, vision_grid_thws = [], []
+             for images in videos:
+                 patches, video_grid_thw = self._preprocess(
+                     images,
+                     do_resize=do_resize,
+                     resample=resample,
+                     do_rescale=do_rescale,
+                     rescale_factor=rescale_factor,
+                     do_normalize=do_normalize,
+                     image_mean=image_mean,
+                     image_std=image_std,
+                     data_format=data_format,
+                     do_convert_rgb=do_convert_rgb,
+                     input_data_format=input_data_format,
+                 )
+                 pixel_values.extend(patches)
+                 vision_grid_thws.append(video_grid_thw)
+             pixel_values = np.array(pixel_values)
+             vision_grid_thws = np.array(vision_grid_thws)
+             data = {
+                 "pixel_values_videos": pixel_values,
+                 "video_grid_thw": vision_grid_thws,
+             }
+
+         return BatchFeature(data=data, tensor_type=return_tensors)
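
The `_preprocess` method above flattens each (optionally resized) image into `grid_t * grid_h * grid_w` patches of shape `(channel, patch_size, patch_size)` and returns that grid alongside the patches. Below is a minimal NumPy-only sketch of the same reshaping; the values `patch_size=14`, `merge_size=2`, `temporal_patch_size=1` and the 224x308 input are illustrative assumptions, not taken from this diff.

```
import numpy as np

# Illustrative values only (assumptions, not read from this repository).
patch_size, merge_size, temporal_patch_size = 14, 2, 1

# Suppose smart_resize produced a 224x308 RGB image (multiples of patch_size * merge_size = 28).
resized_height, resized_width = 224, 308
image = np.random.rand(3, resized_height, resized_width).astype(np.float32)

patches = np.tile(image[None], (temporal_patch_size, 1, 1, 1))   # (T, C, H, W)
grid_t = patches.shape[0] // temporal_patch_size
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
channel = patches.shape[1]

# Same reshape/transpose sequence as the end of _preprocess.
patches = patches.reshape(
    grid_t, temporal_patch_size, channel, grid_h, patch_size, grid_w, patch_size
)
patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, patch_size, patch_size)

print(flatten_patches.shape)      # (352, 3, 14, 14) -> 1 * 16 * 22 patches
print((grid_t, grid_h, grid_w))   # (1, 16, 22)
```

The returned `(grid_t, grid_h, grid_w)` triple is what `preprocess` collects into `image_grid_thw` / `video_grid_thw`, so downstream code knows how many vision patches each image or clip contributes.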
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56eea05562f4432cf7c368774bbaea5acb03dc0c7c37dd6c3217612e49a6557a
+ size 4991719792
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11ca964a00a765e08a2fd2aa22da6e0214f26afa61705a91f4f1cf0327701259
+ size 4983069200
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d68e46b47365ca5de86e565c6942bd4f3f858f1e56b8460371af352fa31cb907
+ size 4915943752
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ee1065cbb3c636460aefaa9c6b00823b1682bf2cc1eac6b1cbc95bac0bad05c
+ size 2503030568
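
The four `model-0000X-of-00004.safetensors` entries above are Git LFS pointer files: the diff records only each shard's `sha256` object id and byte size, not the weights themselves. As a small illustration (the helper below is a sketch, not part of the repository), a pointer file in this three-line format can be read like so:

```
# Sketch: parse a Git LFS pointer file ("version / oid / size" lines as shown above).
def parse_lfs_pointer(path):
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# Example with the first shard's pointer (before git-lfs replaces it with the real weights):
pointer = parse_lfs_pointer("model-00001-of-00004.safetensors")
print(pointer["oid"])        # sha256:56eea055...
print(int(pointer["size"]))  # 4991719792 bytes, roughly 5 GB
```

Summed, the four `size` fields come to roughly 17.4 GB, which matches the `total_size` recorded in `model.safetensors.index.json` below up to per-shard file-format overhead.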
model.safetensors.index.json ADDED
@@ -0,0 +1,861 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 17393657472
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "mlp_AR.linear_1.bias": "model-00001-of-00004.safetensors",
8
+ "mlp_AR.linear_1.weight": "model-00001-of-00004.safetensors",
9
+ "mlp_AR.linear_2.bias": "model-00001-of-00004.safetensors",
10
+ "mlp_AR.linear_2.weight": "model-00001-of-00004.safetensors",
11
+ "mlp_AR.pre_norm.bias": "model-00001-of-00004.safetensors",
12
+ "mlp_AR.pre_norm.weight": "model-00001-of-00004.safetensors",
13
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
35
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
36
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
143
+ "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
145
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
146
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
147
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
148
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
149
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
150
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
151
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
152
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.20.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.21.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.22.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.22.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
263
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
265
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
266
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
267
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
279
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
284
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
293
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
297
+ "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
299
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.32.input_layernorm.weight": "model-00004-of-00004.safetensors",
301
+ "model.layers.32.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
302
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
303
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.32.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
305
+ "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
308
+ "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
309
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
311
+ "model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
312
+ "model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
313
+ "model.layers.33.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
314
+ "model.layers.33.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
315
+ "model.layers.33.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
316
+ "model.layers.33.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
317
+ "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
318
+ "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
319
+ "model.layers.33.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
320
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
321
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
322
+ "model.layers.34.input_layernorm.weight": "model-00004-of-00004.safetensors",
323
+ "model.layers.34.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
324
+ "model.layers.34.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
325
+ "model.layers.34.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
326
+ "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
327
+ "model.layers.34.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
328
+ "model.layers.34.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
329
+ "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.layers.34.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
331
+ "model.layers.34.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
332
+ "model.layers.34.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
333
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
334
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
335
+ "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
336
+ "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
337
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
338
+ "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
339
+ "model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
340
+ "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
341
+ "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
342
+ "model.layers.35.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
343
+ "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
344
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
345
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
346
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
347
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
348
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
349
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
350
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
351
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
353
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
354
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
355
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
356
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
358
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
359
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
360
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
361
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
362
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
363
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
364
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
365
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
366
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
367
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
368
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
369
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
370
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
371
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
372
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
373
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
374
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
375
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
376
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
377
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
378
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
379
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
380
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
381
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
382
+ "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
383
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
384
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
385
+ "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
386
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
387
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
388
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
389
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
390
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
391
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
392
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
393
+ "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
394
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
395
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
396
+ "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
397
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
398
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
399
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
400
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
401
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
402
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
403
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
404
+ "model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
405
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
406
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
407
+ "model.layers.9.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
408
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
409
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
410
+ "model.norm.weight": "model-00004-of-00004.safetensors",
411
+ "visual.vision_model.embeddings.packing_position_embedding.weight": "model-00001-of-00004.safetensors",
412
+ "visual.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00004.safetensors",
413
+ "visual.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00004.safetensors",
414
+ "visual.vision_model.embeddings.position_embedding.weight": "model-00001-of-00004.safetensors",
415
+ "visual.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00004.safetensors",
416
+ "visual.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00004.safetensors",
417
+ "visual.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00004.safetensors",
418
+ "visual.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00004.safetensors",
419
+ "visual.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
420
+ "visual.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
421
+ "visual.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
422
+ "visual.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
423
+ "visual.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
424
+ "visual.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
425
+ "visual.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
426
+ "visual.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
427
+ "visual.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
428
+ "visual.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
429
+ "visual.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
430
+ "visual.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
431
+ "visual.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00004.safetensors",
432
+ "visual.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00004.safetensors",
433
+ "visual.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00004.safetensors",
434
+ "visual.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00004.safetensors",
435
+ "visual.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
436
+ "visual.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
437
+ "visual.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
438
+ "visual.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
439
+ "visual.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
440
+ "visual.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
441
+ "visual.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
442
+ "visual.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
443
+ "visual.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
444
+ "visual.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
445
+ "visual.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
446
+ "visual.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
447
+ "visual.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00004.safetensors",
448
+ "visual.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00004.safetensors",
449
+ "visual.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00004.safetensors",
450
+ "visual.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00004.safetensors",
451
+ "visual.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
452
+ "visual.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
453
+ "visual.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
454
+ "visual.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
455
+ "visual.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
456
+ "visual.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
457
+ "visual.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
458
+ "visual.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
459
+ "visual.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
460
+ "visual.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
461
+ "visual.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
462
+ "visual.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
463
+ "visual.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00004.safetensors",
464
+ "visual.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00004.safetensors",
465
+ "visual.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00004.safetensors",
466
+ "visual.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00004.safetensors",
467
+ "visual.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
468
+ "visual.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
469
+ "visual.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
470
+ "visual.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
471
+ "visual.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
472
+ "visual.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
473
+ "visual.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
474
+ "visual.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
475
+ "visual.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
476
+ "visual.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
477
+ "visual.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
478
+ "visual.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
479
+ "visual.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00004.safetensors",
480
+ "visual.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00004.safetensors",
481
+ "visual.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00004.safetensors",
482
+ "visual.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00004.safetensors",
483
+ "visual.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
484
+ "visual.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
485
+ "visual.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
486
+ "visual.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
487
+ "visual.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
488
+ "visual.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
489
+ "visual.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
490
+ "visual.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
491
+ "visual.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
492
+ "visual.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
493
+ "visual.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
494
+ "visual.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
495
+ "visual.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00004.safetensors",
496
+ "visual.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00004.safetensors",
497
+ "visual.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00004.safetensors",
498
+ "visual.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00004.safetensors",
499
+ "visual.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
500
+ "visual.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
501
+ "visual.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
502
+ "visual.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
503
+ "visual.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
504
+ "visual.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
505
+ "visual.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
506
+ "visual.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
507
+ "visual.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
508
+ "visual.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
509
+ "visual.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
510
+ "visual.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
511
+ "visual.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00004.safetensors",
512
+ "visual.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00004.safetensors",
513
+ "visual.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00004.safetensors",
514
+ "visual.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00004.safetensors",
515
+ "visual.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
516
+ "visual.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
517
+ "visual.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
518
+ "visual.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
519
+ "visual.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
520
+ "visual.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
521
+ "visual.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
522
+ "visual.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
523
+ "visual.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
524
+ "visual.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
525
+ "visual.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
526
+ "visual.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
527
+ "visual.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00004.safetensors",
528
+ "visual.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00004.safetensors",
529
+ "visual.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00004.safetensors",
530
+ "visual.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00004.safetensors",
531
+ "visual.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
532
+ "visual.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
533
+ "visual.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
534
+ "visual.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
535
+ "visual.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
536
+ "visual.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
537
+ "visual.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
538
+ "visual.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
539
+ "visual.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
540
+ "visual.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
541
+ "visual.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
542
+ "visual.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
543
+ "visual.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00004.safetensors",
544
+ "visual.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00004.safetensors",
545
+ "visual.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00004.safetensors",
546
+ "visual.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00004.safetensors",
547
+ "visual.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
548
+ "visual.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
549
+ "visual.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
550
+ "visual.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
551
+ "visual.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
552
+ "visual.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
553
+ "visual.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
554
+ "visual.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
555
+ "visual.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
556
+ "visual.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
557
+ "visual.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
558
+ "visual.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
559
+ "visual.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00004.safetensors",
560
+ "visual.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00004.safetensors",
561
+ "visual.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00004.safetensors",
562
+ "visual.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00004.safetensors",
563
+ "visual.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
564
+ "visual.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
565
+ "visual.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
566
+ "visual.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
567
+ "visual.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
568
+ "visual.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
569
+ "visual.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
570
+ "visual.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
571
+ "visual.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
572
+ "visual.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
573
+ "visual.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
574
+ "visual.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
575
+ "visual.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00004.safetensors",
576
+ "visual.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00004.safetensors",
577
+ "visual.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00004.safetensors",
578
+ "visual.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00004.safetensors",
579
+ "visual.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
580
+ "visual.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
581
+ "visual.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
582
+ "visual.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
583
+ "visual.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
584
+ "visual.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
585
+ "visual.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
586
+ "visual.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
587
+ "visual.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
588
+ "visual.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
589
+ "visual.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
590
+ "visual.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
591
+ "visual.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00004.safetensors",
592
+ "visual.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00004.safetensors",
593
+ "visual.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00004.safetensors",
594
+ "visual.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00004.safetensors",
595
+ "visual.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
596
+ "visual.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
597
+ "visual.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
598
+ "visual.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
599
+ "visual.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
600
+ "visual.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
601
+ "visual.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
602
+ "visual.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
603
+ "visual.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
604
+ "visual.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
605
+ "visual.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
606
+ "visual.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
607
+ "visual.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00004.safetensors",
608
+ "visual.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00004.safetensors",
609
+ "visual.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00004.safetensors",
610
+ "visual.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00004.safetensors",
611
+ "visual.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
612
+ "visual.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
613
+ "visual.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
614
+ "visual.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
615
+ "visual.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
616
+ "visual.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
617
+ "visual.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
618
+ "visual.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
619
+ "visual.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
620
+ "visual.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
621
+ "visual.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
622
+ "visual.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
623
+ "visual.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00004.safetensors",
624
+ "visual.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00004.safetensors",
625
+ "visual.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00004.safetensors",
626
+ "visual.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00004.safetensors",
627
+ "visual.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
628
+ "visual.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
629
+ "visual.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
630
+ "visual.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
631
+ "visual.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
632
+ "visual.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
633
+ "visual.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
634
+ "visual.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
635
+ "visual.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
636
+ "visual.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
637
+ "visual.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
638
+ "visual.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
639
+ "visual.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00004.safetensors",
640
+ "visual.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00004.safetensors",
641
+ "visual.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00004.safetensors",
642
+ "visual.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00004.safetensors",
643
+ "visual.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
644
+ "visual.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
645
+ "visual.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
646
+ "visual.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
647
+ "visual.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
648
+ "visual.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
649
+ "visual.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
650
+ "visual.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
651
+ "visual.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
652
+ "visual.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
653
+ "visual.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
654
+ "visual.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
655
+ "visual.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00004.safetensors",
656
+ "visual.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00004.safetensors",
657
+ "visual.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00004.safetensors",
658
+ "visual.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00004.safetensors",
659
+ "visual.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
660
+ "visual.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
661
+ "visual.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
662
+ "visual.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
663
+ "visual.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
664
+ "visual.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
665
+ "visual.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
666
+ "visual.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
667
+ "visual.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
668
+ "visual.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
669
+ "visual.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
670
+ "visual.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
671
+ "visual.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00004.safetensors",
672
+ "visual.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00004.safetensors",
673
+ "visual.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00004.safetensors",
674
+ "visual.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00004.safetensors",
675
+ "visual.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
676
+ "visual.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
677
+ "visual.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
678
+ "visual.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
679
+ "visual.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
680
+ "visual.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
681
+ "visual.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
682
+ "visual.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
683
+ "visual.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
684
+ "visual.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
685
+ "visual.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
686
+ "visual.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
687
+ "visual.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00004.safetensors",
688
+ "visual.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00004.safetensors",
689
+ "visual.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00004.safetensors",
690
+ "visual.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00004.safetensors",
691
+ "visual.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00004.safetensors",
692
+ "visual.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00004.safetensors",
693
+ "visual.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00004.safetensors",
694
+ "visual.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00004.safetensors",
695
+ "visual.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
696
+ "visual.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
697
+ "visual.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
698
+ "visual.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
699
+ "visual.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
700
+ "visual.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
701
+ "visual.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
702
+ "visual.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
703
+ "visual.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00004.safetensors",
704
+ "visual.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00004.safetensors",
705
+ "visual.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00004.safetensors",
706
+ "visual.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00004.safetensors",
707
+ "visual.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00004.safetensors",
708
+ "visual.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00004.safetensors",
709
+ "visual.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00004.safetensors",
710
+ "visual.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00004.safetensors",
711
+ "visual.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
712
+ "visual.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
713
+ "visual.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
714
+ "visual.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
715
+ "visual.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
716
+ "visual.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
717
+ "visual.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
718
+ "visual.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
719
+ "visual.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00004.safetensors",
720
+ "visual.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00004.safetensors",
721
+ "visual.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00004.safetensors",
722
+ "visual.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00004.safetensors",
723
+ "visual.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00004.safetensors",
724
+ "visual.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
725
+ "visual.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00004.safetensors",
726
+ "visual.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
727
+ "visual.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
728
+ "visual.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
729
+ "visual.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
730
+ "visual.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
731
+ "visual.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
732
+ "visual.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
733
+ "visual.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
734
+ "visual.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
735
+ "visual.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00004.safetensors",
736
+ "visual.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00004.safetensors",
737
+ "visual.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00004.safetensors",
738
+ "visual.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00004.safetensors",
739
+ "visual.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
740
+ "visual.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
741
+ "visual.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
742
+ "visual.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
743
+ "visual.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
744
+ "visual.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
745
+ "visual.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
746
+ "visual.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
747
+ "visual.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
748
+ "visual.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
749
+ "visual.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
750
+ "visual.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
751
+ "visual.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00004.safetensors",
752
+ "visual.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00004.safetensors",
753
+ "visual.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00004.safetensors",
754
+ "visual.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00004.safetensors",
755
+ "visual.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
756
+ "visual.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
757
+ "visual.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
758
+ "visual.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
759
+ "visual.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
760
+ "visual.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
761
+ "visual.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
762
+ "visual.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
763
+ "visual.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
764
+ "visual.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
765
+ "visual.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
766
+ "visual.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
767
+ "visual.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00004.safetensors",
768
+ "visual.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00004.safetensors",
769
+ "visual.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00004.safetensors",
770
+ "visual.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00004.safetensors",
771
+ "visual.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
772
+ "visual.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
773
+ "visual.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
774
+ "visual.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
775
+ "visual.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
776
+ "visual.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
777
+ "visual.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
778
+ "visual.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
779
+ "visual.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
780
+ "visual.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
781
+ "visual.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
782
+ "visual.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
783
+ "visual.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00004.safetensors",
784
+ "visual.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00004.safetensors",
785
+ "visual.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00004.safetensors",
786
+ "visual.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00004.safetensors",
787
+ "visual.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
788
+ "visual.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
789
+ "visual.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
790
+ "visual.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
791
+ "visual.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
792
+ "visual.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
793
+ "visual.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
794
+ "visual.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
795
+ "visual.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
796
+ "visual.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
797
+ "visual.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
798
+ "visual.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
799
+ "visual.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00004.safetensors",
800
+ "visual.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00004.safetensors",
801
+ "visual.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00004.safetensors",
802
+ "visual.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00004.safetensors",
803
+ "visual.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
804
+ "visual.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
805
+ "visual.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
806
+ "visual.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
807
+ "visual.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
808
+ "visual.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
809
+ "visual.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
810
+ "visual.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
811
+ "visual.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
812
+ "visual.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
813
+ "visual.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
814
+ "visual.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
815
+ "visual.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00004.safetensors",
816
+ "visual.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00004.safetensors",
817
+ "visual.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00004.safetensors",
818
+ "visual.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00004.safetensors",
819
+ "visual.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
820
+ "visual.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
821
+ "visual.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
822
+ "visual.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
823
+ "visual.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
824
+ "visual.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
825
+ "visual.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
826
+ "visual.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
827
+ "visual.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
828
+ "visual.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
829
+ "visual.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
830
+ "visual.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
831
+ "visual.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00004.safetensors",
832
+ "visual.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00004.safetensors",
833
+ "visual.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00004.safetensors",
834
+ "visual.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00004.safetensors",
835
+ "visual.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
836
+ "visual.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
837
+ "visual.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
838
+ "visual.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
839
+ "visual.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
840
+ "visual.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
841
+ "visual.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
842
+ "visual.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
843
+ "visual.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
844
+ "visual.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
845
+ "visual.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
846
+ "visual.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
847
+ "visual.vision_model.head.attention.in_proj_bias": "model-00001-of-00004.safetensors",
848
+ "visual.vision_model.head.attention.in_proj_weight": "model-00001-of-00004.safetensors",
849
+ "visual.vision_model.head.attention.out_proj.bias": "model-00001-of-00004.safetensors",
850
+ "visual.vision_model.head.attention.out_proj.weight": "model-00001-of-00004.safetensors",
851
+ "visual.vision_model.head.layernorm.bias": "model-00001-of-00004.safetensors",
852
+ "visual.vision_model.head.layernorm.weight": "model-00001-of-00004.safetensors",
853
+ "visual.vision_model.head.mlp.fc1.bias": "model-00001-of-00004.safetensors",
854
+ "visual.vision_model.head.mlp.fc1.weight": "model-00001-of-00004.safetensors",
855
+ "visual.vision_model.head.mlp.fc2.bias": "model-00001-of-00004.safetensors",
856
+ "visual.vision_model.head.mlp.fc2.weight": "model-00001-of-00004.safetensors",
857
+ "visual.vision_model.head.probe": "model-00001-of-00004.safetensors",
858
+ "visual.vision_model.post_layernorm.bias": "model-00001-of-00004.safetensors",
859
+ "visual.vision_model.post_layernorm.weight": "model-00001-of-00004.safetensors"
860
+ }
861
+ }
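
The weight map above routes every `visual.vision_model.*` tensor to the first of the four shards. As a quick sanity check of a local download, the sketch below (my own snippet, not part of this commit; it assumes `model.safetensors.index.json` sits in the current directory) looks up a parameter's shard and counts tensors per shard.

```python
import json
from collections import Counter

# Load the shard index added in this commit (local path is an assumption).
with open("model.safetensors.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]  # parameter name -> shard filename
print(weight_map["visual.vision_model.post_layernorm.weight"])
# model-00001-of-00004.safetensors

# How many tensors each shard holds.
print(Counter(weight_map.values()))
```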
modeling_keye.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 1003520,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 1,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.5, 0.5, 0.5
9
+ ],
10
+ "image_std": [
11
+ 0.5, 0.5, 0.5
12
+ ],
13
+ "processor_class": "KeyeProcessor",
14
+ "auto_map": {
15
+ "AutoProcessor": "processing_keye.KeyeProcessor",
16
+ "AutoImageProcessor": "image_processing_keye.SiglipImageProcessor"
17
+ }
18
+ }
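
For orientation, the pixel budget above translates directly into a visual-token budget: with `patch_size` 14 and `merge_size` 2, every 28x28 pixel block becomes one token for the language model. The arithmetic below is my own illustration of those bounds, not code from this repository.

```python
patch_size = 14       # pixels per ViT patch side
merge_size = 2        # 2x2 patches merge into one LLM visual token
min_pixels = 3136
max_pixels = 1003520

def visual_tokens(pixels: int) -> int:
    """Number of merged visual tokens for a given pixel count."""
    patches = pixels // (patch_size * patch_size)
    return patches // (merge_size * merge_size)

print(visual_tokens(min_pixels))  # 4 tokens at the lower bound (a 56x56 image)
print(visual_tokens(max_pixels))  # 1280 tokens at the upper bound
```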
processing_keye.py ADDED
@@ -0,0 +1,298 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from typing import List, Union
21
+ import numpy as np
22
+ from transformers.feature_extraction_utils import BatchFeature
23
+ from transformers.processing_utils import (
24
+ ProcessingKwargs,
25
+ ProcessorMixin,
26
+ Unpack,
27
+ VideosKwargs,
28
+ )
29
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
30
+ import torch
31
+
32
+
33
+ ImageInput = Union[
34
+ "PIL.Image.Image",
35
+ np.ndarray,
36
+ "torch.Tensor",
37
+ List["PIL.Image.Image"],
38
+ List[np.ndarray],
39
+ List["torch.Tensor"],
40
+ ] # noqa
41
+
42
+
43
+ VideoInput = Union[
44
+ List["PIL.Image.Image"],
45
+ "np.ndarray",
46
+ "torch.Tensor",
47
+ List["np.ndarray"],
48
+ List["torch.Tensor"],
49
+ List[List["PIL.Image.Image"]],
50
+ List[List["np.ndarrray"]],
51
+ List[List["torch.Tensor"]],
52
+ ] # noqa
53
+
54
+
55
+ class KeyeVideosProcessorKwargs(VideosKwargs, total=False):
56
+ fps: Union[List[float], float]
57
+
58
+
59
+ class KeyeProcessorKwargs(ProcessingKwargs, total=False):
60
+ videos_kwargs: KeyeVideosProcessorKwargs
61
+ _defaults = {
62
+ "text_kwargs": {
63
+ "padding": False,
64
+ },
65
+ "videos_kwargs": {"fps": 2.0},
66
+ }
67
+
68
+
69
+ class KeyeProcessor(ProcessorMixin):
70
+ r"""
71
+ [`KeyeProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`Qwen2TokenizerFast`]. See the
72
+ [`~KeyeProcessor.__call__`] and [`~KeyeProcessor.decode`] for more information.
73
+ Args:
74
+ image_processor ([`SiglipImageProcessor`], *optional*):
75
+ The image processor is a required input.
76
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
77
+ The tokenizer is a required input.
78
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
79
+ in a chat into a tokenizable string.
80
+ """
81
+
82
+ attributes = ["image_processor", "tokenizer"]
83
+ valid_kwargs = [
84
+ "chat_template",
85
+ "image_std",
86
+ "min_pixels",
87
+ "image_mean",
88
+ "merge_size",
89
+ "image_processor_type",
90
+ "temporal_patch_size",
91
+ "patch_size",
92
+ "max_pixels",
93
+ ]
94
+
95
+ image_processor_class = "AutoImageProcessor"
96
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
97
+
98
+ def __init__(
99
+ self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
100
+ ):
101
+ self.image_token = (
102
+ "<|image_pad|>"
103
+ if not hasattr(tokenizer, "image_token")
104
+ else tokenizer.image_token
105
+ )
106
+ self.video_token = (
107
+ "<|video_pad|>"
108
+ if not hasattr(tokenizer, "video_token")
109
+ else tokenizer.video_token
110
+ )
111
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
112
+
113
+ def __call__(
114
+ self,
115
+ images: ImageInput = None,
116
+ text: Union[
117
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
118
+ ] = None,
119
+ videos: VideoInput = None,
120
+ **kwargs: Unpack[KeyeProcessorKwargs],
121
+ ) -> BatchFeature:
122
+ """
123
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
124
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
125
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
126
+ SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `vision_infos` is not `None`.
127
+
128
+ Args:
129
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
130
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
131
+ tensor. Both channels-first and channels-last formats are supported.
132
+ text (`str`, `List[str]`, `List[List[str]]`):
133
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
134
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
135
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
136
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
137
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
138
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
139
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
140
+ If set, will return tensors of a particular framework. Acceptable values are:
141
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
142
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
143
+ - `'np'`: Return NumPy `np.ndarray` objects.
144
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
145
+
146
+ Returns:
147
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
148
+
149
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
150
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
151
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
152
+ `None`).
153
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
154
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
155
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
156
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
157
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
158
+ """
159
+ output_kwargs = self._merge_kwargs(
160
+ KeyeProcessorKwargs,
161
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
162
+ **kwargs,
163
+ )
164
+
165
+ if images is not None:
166
+ image_inputs = self.image_processor(images=images, return_tensors="pt")
167
+ image_inputs["pixel_values"] = image_inputs["pixel_values"]
168
+ image_grid_thw = image_inputs["image_grid_thw"]
169
+
170
+ else:
171
+ image_inputs = {}
172
+ image_grid_thw = None
173
+
174
+ if videos is not None:
175
+ # TODO: add a dedicated video processing path; for now videos are routed through the image processor
176
+ videos_inputs = self.image_processor(
177
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
178
+ )
179
+ video_grid_thw = videos_inputs["video_grid_thw"]
180
+
181
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
182
+ if isinstance(fps, (int, float)):
183
+ second_per_grid_ts = [
184
+ self.image_processor.temporal_patch_size / fps
185
+ ] * len(video_grid_thw)
186
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
187
+ second_per_grid_ts = [
188
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
189
+ ]
190
+ else:
191
+ raise ValueError(
192
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
193
+ )
194
+ videos_inputs.update(
195
+ {"second_per_grid_ts": torch.tensor(second_per_grid_ts)}
196
+ )
197
+
198
+ else:
199
+ videos_inputs = {}
200
+ video_grid_thw = None
201
+
202
+ if not isinstance(text, list):
203
+ text = [text]
204
+
205
+ if image_grid_thw is not None:
206
+ index = 0
207
+ for i in range(len(text)):
208
+ while self.image_token in text[i]:
209
+ text[i] = text[i].replace(
210
+ self.image_token,
211
+ "<|placeholder|>"
212
+ * (
213
+ image_grid_thw[index].prod()
214
+ // self.image_processor.merge_size
215
+ // self.image_processor.merge_size
216
+ ),
217
+ 1,
218
+ )
219
+ index += 1
220
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
221
+
222
+ if video_grid_thw is not None:
223
+ index = 0
224
+ for i in range(len(text)):
225
+ while self.video_token in text[i]:
226
+ text[i] = text[i].replace(
227
+ self.video_token,
228
+ "<|placeholder|>"
229
+ * (
230
+ video_grid_thw[index].prod()
231
+ // self.image_processor.merge_size
232
+ // self.image_processor.merge_size
233
+ ),
234
+ 1,
235
+ )
236
+ index += 1
237
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
238
+
239
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
240
+
241
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
242
+
243
+ def batch_decode(self, *args, **kwargs):
244
+ """
245
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
246
+ refer to the docstring of this method for more information.
247
+ """
248
+ return self.tokenizer.batch_decode(*args, **kwargs)
249
+
250
+ def decode(self, *args, **kwargs):
251
+ """
252
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
253
+ the docstring of this method for more information.
254
+ """
255
+ return self.tokenizer.decode(*args, **kwargs)
256
+
257
+ def post_process_image_text_to_text(
258
+ self,
259
+ generated_outputs,
260
+ skip_special_tokens=True,
261
+ clean_up_tokenization_spaces=False,
262
+ **kwargs,
263
+ ):
264
+ """
265
+ Post-process the output of the model to decode the text.
266
+
267
+ Args:
268
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
269
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
270
+ or `(sequence_length,)`.
271
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
272
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
273
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
274
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
275
+ **kwargs:
276
+ Additional arguments to be passed to the tokenizer's `batch_decode` method.
277
+
278
+ Returns:
279
+ `List[str]`: The decoded text.
280
+ """
281
+ return self.tokenizer.batch_decode(
282
+ generated_outputs,
283
+ skip_special_tokens=skip_special_tokens,
284
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
285
+ **kwargs,
286
+ )
287
+
288
+ @property
289
+ def model_input_names(self):
290
+ tokenizer_input_names = self.tokenizer.model_input_names
291
+ image_processor_input_names = self.image_processor.model_input_names
292
+ names_from_processor = list(
293
+ dict.fromkeys(tokenizer_input_names + image_processor_input_names)
294
+ )
295
+ return names_from_processor + ["second_per_grid_ts"]
296
+
297
+
298
+ __all__ = ["KeyeProcessor", "KeyeProcessor_moonvit", "KeyeProcessor"]
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_keye.KeyeProcessor"
4
+ },
5
+ "processor_class": "KeyeProcessor"
6
+ }
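
With the `auto_map` above in place, the custom processor can be loaded through `AutoProcessor` as long as remote code is trusted. A minimal loading sketch follows; the repository id is an assumption.

```python
from transformers import AutoProcessor

# Repository id is assumed for illustration; replace with the actual repo id or a local path.
processor = AutoProcessor.from_pretrained(
    "Kwai-Keye/Keye-VL-8B-Preview",
    trust_remote_code=True,  # required so processing_keye.KeyeProcessor is used
)
print(type(processor).__name__)  # KeyeProcessor
```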
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|vision_start|>",
6
+ "<|vision_end|>",
7
+ "<|vision_pad|>",
8
+ "<|image_pad|>",
9
+ "<|video_pad|>"
10
+ ],
11
+ "eos_token": {
12
+ "content": "<|im_end|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "pad_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ }
25
+ }
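
As a quick check, the special tokens declared here resolve to the reserved ids listed further down in `tokenizer_config.json`. A small sketch, assuming the tokenizer files from this commit are available in the current directory:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # local path is an assumption

print(tok.eos_token, tok.pad_token)                # <|im_end|> <|endoftext|>
print(tok.convert_tokens_to_ids("<|image_pad|>"))  # 151655
print(tok.convert_tokens_to_ids("<|video_pad|>"))  # 151656
```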
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ceaf87113caa06d8b2e2f6966ab11d12ac590cb887b64c591cae70ea89245f4
3
+ size 11422655
tokenizer_config.json ADDED
@@ -0,0 +1,282 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<|point_start|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "151670": {
222
+ "content": "<|point_end|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "151671": {
230
+ "content": "<|ocr_text_start|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "151672": {
238
+ "content": "<|ocr_text_end|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "151673": {
246
+ "content": "<|clip_time_start|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "151674": {
254
+ "content": "<|clip_time_end|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ }
261
+ },
262
+ "additional_special_tokens": [
263
+ "<|im_start|>",
264
+ "<|im_end|>",
265
+ "<|vision_start|>",
266
+ "<|vision_end|>",
267
+ "<|vision_pad|>",
268
+ "<|image_pad|>",
269
+ "<|video_pad|>"
270
+ ],
271
+ "bos_token": null,
272
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
273
+ "clean_up_tokenization_spaces": false,
274
+ "eos_token": "<|im_end|>",
275
+ "errors": "replace",
276
+ "model_max_length": 131072,
277
+ "pad_token": "<|endoftext|>",
278
+ "split_special_tokens": false,
279
+ "tokenizer_class": "Qwen2Tokenizer",
280
+ "unk_token": null
281
+ }
282
+
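
The `chat_template` above wraps each image in `<|vision_start|><|image_pad|><|vision_end|>` and, with `add_generation_prompt=True`, ends on an open assistant turn. A hedged rendering sketch (repository id and image path are placeholders):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Kwai-Keye/Keye-VL-8B-Preview")  # repo id assumed
messages = [
    {"role": "user", "content": [
        {"type": "image", "image": "file:///path/to/demo.jpg"},
        {"type": "text", "text": "Describe this image."},
    ]},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# <|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>
# <|im_start|>assistant
```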
vocab.json ADDED
The diff for this file is too large to render. See raw diff