vxo hyperclovax committed
Commit 2c33877 · verified · 0 Parent(s):

Duplicate from naver-hyperclovax/HyperCLOVAX-SEED-Think-14B


Co-authored-by: HyperCLOVA X (admin) <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,122 @@
1
+ HyperCLOVA X SEED 14B Think Model License Agreement
2
+
3
+ Model Release Date: July 22, 2025
4
+
5
+ This HyperCLOVA X SEED 14B Think Model License Agreement (the “Agreement”) is a legal agreement between you and NAVER Corporation (“Naver Corp.”) and NAVER Cloud Corporation (“Naver Cloud Corp.”) (Naver Corp. and Naver Cloud Corp. are collectively referred to as “NAVER”) and governs your use of the Models that NAVER provides to You under this Agreement.
6
+
7
+ NAVER Corp., as the holder of the intellectual property of the Model, and its affiliate, NAVER Cloud Corp., as the exclusive business operator of HyperCLOVA X, enter into this Agreement with you. NAVER and you are each a “party” and collectively the “parties.”
8
+
9
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Model, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement. You represent to us that you are lawfully able to enter into contracts, and if you are entering into this Agreement for an entity, that you have legal authority to bind that entity.
10
+
11
+ 1. Definitions.
12
+
13
+ 1.1. "Affiliate” means any entity directly or indirectly controlling, controlled by or under common control with either party, where “control” means the possession, directly or indirectly, of the power to independently direct or cause the direction of the management and policies of an entity, whether through ownership of more than fifty percent (50%) of the stock or other equity interests entitled to vote for representation on its board of directors, or body performing similar functions, by contract or otherwise.
14
+
15
+ 1.2. “Derivative Model” means all (i) modifications to the Model, (ii) works based on the Model, or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of the Model, to that model in order to cause that model to perform similarly to the Model, including distillation methods that use intermediate data representations or methods based on the generation of synthetic data Outputs by the Model for training that Model. For clarity, Outputs are not deemed Derivative Model.
16
+
17
+ 1.3. “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
18
+
19
+ 1.4. “Model” means the foundational large language models and software and algorithms, including machine-learning model code and trained model weights distributed by NAVER.
20
+
21
+
22
+ 1.5. “Output” means the information content output of the Model or a Derivative Model that results from operating or otherwise using the Model or Derivative Model.
23
+
24
+ 2. Conditions for Use, License Grant and Restrictions
25
+
26
+ 2.1. Conditions for Use. The Model and any Derivative Model are subject to the terms of this Agreement and govern your use. If You institute copyright or patent litigation against any entity (including a crossclaim or counterclaim in a lawsuit) alleging that the Model or Derivative Model constitutes direct or contributory copyright or patent infringement, then any license granted to you under this Agreement for that Model or Derivative Model will terminate as of the date such litigation is filed. NAVER may update this Agreement to comply with legal and regulatory requirements any time and You agree to either comply with any updated license or cease your copying, use, and distribution of the Model and any Derivative Model.
27
+
28
+ 2.2. License Grant. Subject to the terms and conditions of this Agreement, NAVER hereby grants to you a non-exclusive, worldwide, non-transferable, revocable and royalty-free limited license under NAVER’s intellectual property or other rights owned by NAVER embodied in the Model to access, download, install, copy, use, reproduce, distribute, create derivative works of, and make modifications to the Model.
29
+
30
+ 2.3. Prohibited Use Policy. NAVER is committed to ensuring safety, trust, and transparency in the development and use of AI technologies. Accordingly, your use of the Model and any Derivative Models is subject to the following conditions:
31
+ (i) You must ensure that any product or service you develop, use, offer as a service, or distribute complies with all applicable laws and regulations, and is operated appropriately for the relevant industry or use case.
32
+ (ii) You must comply with the Acceptable Use Policy applicable to the Model and any Derivative Models, which is attached hereto as Addendum A and incorporated by reference into this Agreement.
33
+ (iii) NAVER expressly prohibits the use of its products or services for any purpose in violation of applicable law and regulation, including but not limited to:
34
+ (a) illegal surveillance,
35
+ (b) illegal collection or processing of biometric information without the consent of the subject which is required under applicable law, or
36
+ (c) illegal harassment, abuse, threatening or bullying of individuals or groups of individuals or intentionally misleading or deceiving others.
37
+ (iv) You must take reasonable measures to address unintended bias and to mitigate harm to others, including underrepresented or vulnerable groups.
38
+
39
+
40
+ 3. Redistribution.
41
+
42
+ 3.1. You may reproduce, distribute or make available the Model or Derivative Models thereof, or a product or service (including another AI model) that contains any of them, if you meet all of the following conditions: you must (i) include the Prohibited Use Policy referenced in Section 2.3. as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of the Model or Derivative Model, and you must provide notice to subsequent users to whom you distribute the Model or Derivative Models that they are subject to the use restrictions in Section 2.3., (ii) provide all third party recipients of the Model or Derivative Models a copy of this Agreement, (iii) cause any modified files to carry prominent notices stating that you modified the files; (iv) include the following attribution notice within a “Notice” text file distributed as part of such copies: “HyperCLOVA X SEED 14B Think Model is licensed under the HyperCLOVA X SEED 14B Think Model License Agreement, Copyright © NAVER Corp. All Rights Reserved.”, and (v) prominently display “Powered by HyperCLOVA X” on a related website, user interface, blogpost, about page, or product documentation. If you use the Model or any Outputs of the Model to create, train, fine-tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “HyperCLOVA X” at the beginning of any such AI model name.
43
+ 3.2. You may add your own copyright statement to your modifications and, except as set forth in this Section, may provide additional or different license terms and conditions for use, reproduction, or distribution of your modifications, or for any such Derivative Models as a whole, provided your use, reproduction, and distribution of the Model or Derivative Models otherwise comply with the terms and conditions stated in this Agreement. Any additional or different terms and conditions you impose must not conflict with the terms of this Agreement.
44
+
45
+ 4. Additional Commercial Terms. If (i) as of the Model Release Date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s Affiliates, is greater than 10 million monthly active users in the preceding calendar month, or (ii) the Licensee or its Affiliate distributes or makes available any product or service, which is substantially similar to or directly competes with any product and service provided by NAVER, then the Licensee must request a license from NAVER. Such a license may be granted by NAVER at its sole discretion, and the Licensee is not authorized to exercise any rights under this Agreement unless and until NAVER expressly grants you such rights.
46
+
47
+ 5. Generated Output. NAVER claims no rights in Outputs you generate using the Model. You and your use are solely responsible for Outputs and their subsequent uses.
48
+
49
+ 6. DISCLAIMER OF WARRANTY. UNLESS REQUIRED BY APPLICABLE LAW, THE MODEL AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND NAVER DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE MODEL, DERIVATIVE MODELS, OUTPUTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MODEL AND ANY OUTPUTS AND RESULTS AND YOUR EXERCISE OF PERMISSION UNDER THIS AGREEMENT.
50
+
51
+ 7. LIMITATION OF LIABILITY. IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW (SUCH AS IN CASES OF DELIBERATE AND GROSSLY NEGLIGENT ACTS), WILL NAVER BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY, OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND, ARISING FROM OR RELATED TO THIS AGREEMENT, OR RESULTING FROM THE USE OR INABILITY TO USE THE MODEL, DERIVATIVE MODELS OR, OUTPUTS (INCLUDING, BUT NOT LIMITED TO, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGES, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF NAVER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
52
+
53
+ 8. Indemnity. You will indemnify and hold harmless NAVER from and against any claim by any third party arising out of or related to your use or distribution of the Model, Derivative Model or Outputs.
54
+
55
+ 9. Intellectual Property.
56
+
57
+ 9.1. This Agreement does not grant permission to use the trade names, trademarks, service marks, or product names of NAVER, except as required for reasonable and customary use in describing the origin of the Model and reproducing the content of the “Notice” text file.
58
+
59
+ 9.2. NAVER Corp. owns the Model and any Derivative Model created by NAVER Corp. Except as expressly granted in this Agreement, NAVER Corp. reserves all rights, interests and remedies in connection with the Model and Derivative Model created by NAVER Corp. and no other license or right is granted to you by implication, estoppel or otherwise. Subject to NAVER Corp.’s ownership of the Model and any Derivative Model made by or for NAVER Corp., with respect to any derivative works and modifications of the Model that are made by you, as between you and NAVER Corp., you are and will be the owner of such derivative works and modifications.
60
+
61
+ 10. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Model and will continue in full force and effect until terminated in accordance with the terms and conditions of this Agreement. NAVER may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Model and Derivative Model. Section 5, 6, 7 and 10 shall survive the termination of this Agreement.
62
+
63
+ 11. Governing Law and Jurisdiction.
64
+
65
+ 11.1. This Agreement will be governed by and construed in accordance with the laws of the Republic of Korea, without regard to its conflicts of laws principles.
66
+
67
+ 11.2. Any disputes, controversies, or claims arising out of or relating to this Agreement, including its existence, validity, interpretation, performance, breach, or termination, shall be referred to and finally resolved by arbitration administered by the Korean Commercial Arbitration Board (KCAB) in accordance with the International Arbitration Rules of the Korean Commercial Arbitration Board in force at the time of the commencement of the arbitration. The seat of arbitration shall be Seoul, Republic of Korea. The tribunal shall consist of one arbitrator. The language of the arbitration shall be English. Either party may seek interim or provisional relief from a court of competent jurisdiction and doing so shall not be considered a waiver of any provision in this section. The arbitral tribunal also has the authority to issue orders for interim or provisional relief.
68
+
69
+ 12. Modifications. NAVER reserves the right to modify or amend this Agreement at any time, in its sole discretion. Any modifications will be effective upon posting the updated Agreement on our website or through other means of communication. You are responsible for reviewing the Agreement periodically for changes.
70
+
71
+ 13. No Waiver. NAVER will not be treated as having waived any rights by not exercising (or delaying the exercise of) any rights under this Agreement.
72
+
73
+
74
+
75
+
76
+ Addendum A – Acceptable Use Policy
77
+
78
+ NAVER is committed to promoting safe and responsible use of its AI technologies, including the HyperCLOVA X SEED 14B Think Model (the “Model”). By accessing or using the Model and Derivative Model (Defined in the Model License Agreement) (the Model and Derivative Model are collectively referred to as the “Models”), you agree to this Acceptable Use Policy (“Policy”).
79
+
80
+ We want everyone to use the Models safely, legally, and ethically. You agree that you will not use, or allow others to use, the Models to:
81
+
82
+ 1. Violate applicable laws or the rights of others, including by:
83
+ a. Engaging in, promoting, contributing to, encouraging, planning, inciting, or furthering illegal or unlawful activity or content, such as:
84
+ * Violence or terrorism
85
+ * Exploitation or harm to children, including the creation or dissemination of child exploitative content
86
+ * Human trafficking, exploitation, or sexual violence
87
+ * The unlawful distribution of obscene or harmful material to minors, or failure to apply legally required age restrictions
88
+ * Sexual solicitation or sexually exploitative behavior
89
+ * Any other criminal activity
90
+ b. Engaging in, promoting, inciting, or facilitating the harassment, abuse, threatening, or bullying of individuals or groups
91
+ c. Engaging in, promoting, inciting, or facilitating discrimination or other unlawful or harmful conduct in the provision of employment, credit, housing, or access to essential goods and services
92
+ d. Providing unauthorized or unlicensed professional services, including but not limited to financial, legal, medical/health, or related services
93
+ e. Collecting, processing, disclosing, generating, or inferring private or sensitive personal information, including identity, health, or demographic data, unless lawfully permitted under applicable laws
94
+ f. Infringing, misappropriating, or otherwise violating third-party rights, including through the generation or use of outputs derived from the Models
95
+ g. Creating, generating, or facilitating malicious code, malware, or computer viruses, or interfering with the functioning, security, or integrity of a website, application, or system
96
+ h. Intentionally bypassing or disabling usage restrictions, safety measures, or access controls imposed by NAVER
97
+
98
+ 2. Engage in or promote use cases that may pose a risk of death, bodily harm, or significant safety hazard to individuals, including use of the Models in connection with:
99
+ a. Military, warfare, nuclear technology or espionage
100
+ b. The development or distribution of firearms or illegal weapons
101
+ c. Illegal drugs or regulated controlled substances
102
+ d. Operation of critical infrastructure, transportation systems, or heavy machinery
103
+ e. Content promoting self-harm, including suicide, or eating disorders
104
+ f. Any other use intended to incite or cause physical harm
105
+
106
+ 3. Intentionally deceive or mislead others, including by:
107
+ a. Generating, promoting, or disseminating fraudulent or misleading content
108
+ b. Creating or sharing defamatory content
109
+ c. Generating or distributing spam
110
+ d. Impersonating another individual or entity without proper authorization
111
+ e. Representing Model output as human-generated
112
+ f. Generating or enabling fake online engagement, such as fake reviews or fake users
113
+
114
+ 4. Fail to disclose to end users any known risks or limitations of an AI system that incorporates the Models.
115
+
116
+ 5. Use the Models in conjunction with third-party tools, models, or software designed to generate unlawful content or conduct, or falsely represent outputs from such tools as associated with NAVER or HyperCLOVA X.
117
+
118
+ If you become aware of a violation of this Policy, a bug, or any behavior that could result in a breach of this Policy, please report it to us:
119
+
120
+ Reporting risky outputs: [email protected]
121
+ Reporting policy violations or unauthorized use: [email protected]
122
+
README.md ADDED
@@ -0,0 +1,603 @@
1
+ ---
2
+ license: other
3
+ license_name: hyperclovax-seed
4
+ license_link: LICENSE
5
+ library_name: transformers
6
+ ---
7
+
8
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/6512d9827fccffe1e9e28fa7/7BT1W9eHLQjRCCENwcXmE.png)
9
+
10
+ ## Overview
11
+
12
+ HyperCLOVA X SEED 14B Think is a next-generation language model that moves beyond the conventional approach of simply increasing model size to improve performance. It combines [HyperCLOVA X’s lightweighting technology](https://tinyurl.com/y3hrfz67) for building high-efficiency LLMs with advanced reasoning capabilities. Its development relied on two key technologies: (1) Pruning & Knowledge Distillation, which achieves both compactness and high performance, and (2) a Reinforcement Learning (RL) pipeline, which maximizes reasoning ability. By pruning low-importance parameters and distilling knowledge from a large model into a smaller one, training costs have been significantly reduced. On top of this, [the latest RL recipe validated in HyperCLOVA X Think](https://arxiv.org/pdf/2506.22403) is applied in a multi-stage process: (1) Supervised Fine-Tuning (SFT), (2) Reinforcement Learning with Verifiable Rewards (RLVR), (3) Length Controllability (LC) for reasoning path optimization, and (4) a joint training of Reinforcement Learning from Human Feedback (RLHF) and RLVR.
13
+
14
+ It is a considerable challenge to equip a pruned, knowledge-distilled model with reasoning capabilities, since reductions in training costs and model size often degrade reasoning performance. However, through extensive research experience and persistent trial and error, the HyperCLOVA X team has succeeded in lowering training costs while maintaining reasoning performance comparable to that of larger, resource-intensive models.
15
+
16
+
17
+ ## Basic Information
18
+
19
+ - **Architecture**: Transformer-based architecture with Peri-Layer Normalization and Maximal Update Parameterization (μP) (Dense Model)
+ - **Parameters**: 14.74B
+ - **Input/Output Format**: Text / Text
+ - **Context Length**: 32k
23
+
24
+ ## Training Cost
25
+
26
+ `HyperCLOVA X SEED 14B Think` was trained at a significantly lower cost compared to high-performance external models of similar scale. By utilizing HCX’s lightweight training pipeline, it was trained at approximately **52.60×** lower cost than `Qwen2.5-14B` and **91.38×** lower cost than `Qwen3-14B`.
27
+
28
+ | Model (Base) | GPU Hours (A100-80GB, MFU 50%) |
29
+ | ------------------------------- | ---------------------------------- |
30
+ | **HyperCLOVA X SEED 14B Think** | **68,049** |
31
+ | Qwen2.5-0.5B | 169,257 |
32
+ | Qwen2.5-1.5B | 449,643 |
33
+ | Qwen3-0.6B | 602,460 |
34
+ | Qwen3-1.7B | 1,063,991 |
35
+ | HyperCLOVA X Think | 2,197,732 |
36
+ | **Qwen2.5-14B** | **3,603,432** |
37
+ | Qwen3-8B | 3,993,607 |
38
+ | **Qwen3-14B** | **6,267,077** |
39
+ | Qwen3-32B | 14,108,748 |
40
+
41
+ ## Benchmarks
42
+
43
+ Compared to global models of a similar scale, such as Qwen3 14B, HyperCLOVA X SEED 14B Think demonstrates superior performance in Korean language and cultural understanding, while showing competitive performance in math and coding tasks, which are directly or indirectly related to agent capabilities. This trend remains consistent even when compared with larger models like Qwen3 32B and LG Exaone-Deep 32B.
44
+
45
+ ### Backbone Benchmarks Performance Comparison (Non-think)
46
+
47
+ **Korean/Korea Culture**
48
+
49
+ | Model | Average | CLIcK | HAERAE-Bench | KOBEST | KorMedMCQA | KMMLU | KoBigBench | KoCommonGEN-v2 |
50
+ | ------------------------------- | ------- | ------ | ------------ | ------ | ---------- | ------ | ---------- | -------------- |
51
+ | **HyperCLOVA X SEED 14B Think** | 0.7269 | 0.7208 | 0.8506 | 0.8570 | 0.6411 | 0.5428 | 0.7482 | 0.6682 |
52
+ | QWEN3-8B | 0.6759 | 0.6206 | 0.6618 | 0.7919 | 0.6471 | 0.5543 | 0.7186 | 0.5773 |
53
+ | QWEN3-14B | 0.7079 | 0.6707 | 0.6975 | 0.8174 | 0.6979 | 0.5864 | 0.7507 | 0.5927 |
54
+
55
+
56
+ **English/American Culture**
57
+ | Model | Average | MMLU | BigBench-Hard | Hellaswag | Winogrande | PIQA | ARC-challenge | Social IQa |
58
+ | ------------------------------- | ------- | ------ | ------------- | --------- | ---------- | ------ | ------------- | ---------- |
59
+ | **HyperCLOVA X SEED 14B Think** | 0.6614 | 0.7121 | 0.6216 | 0.6125 | 0.7593 | 0.7791 | 0.6246 | 0.5205 |
60
+ | QWEN3-8B | 0.6548 | 0.7490 | 0.6072 | 0.5817 | 0.7198 | 0.7666 | 0.6433 | 0.5159 |
61
+ | QWEN3-14B | 0.6807 | 0.7885 | 0.6325 | 0.6143 | 0.7356 | 0.8025 | 0.6698 | 0.5215 |
62
+
63
+
64
+ ### Reasoning Performance Comparison
65
+
66
+ **Korean/Korea Culture**
67
+ | Model | KMMLU | CSAT-ko-2025 | KorMedMCQA | KoBALT | HAERAE | CLIcK | KoBigBench | LogicKor |
68
+ |-----------------------------------------|--------|--------|--------|--------|--------|--------|--------|------|
69
+ | HyperCLOVA X SEED 14B Think **(Think)** | 0.6649 | 0.7516 | 0.6933 | 0.4500 | 0.8537 | 0.7280 | 0.7974 | 8.74 |
70
+ | QWEN3-8B | 0.5543 | 0.7200 | 0.6782 | 0.3060 | 0.6618 | 0.6690 | 0.7850 | 8.92 |
71
+ | QWEN3-14B | 0.4930 | 0.7710 | 0.6850 | 0.3840 | 0.7410 | 0.6880 | 0.8380 | 9.15 |
72
+
73
+ **Coding/Math**
74
+ | Model | GSM8k | MATH500 | HumanEval | MBPP |
75
+ |-----------------------------------------|--------|--------|--------|--------|
76
+ | HyperCLOVA X SEED 14B Think | 0.9553 | 0.9380 | 0.9451 | 0.8759 |
77
+ | QWEN3-14B | 0.9590 | 0.9680 | 0.9570 | 0.9080 |
78
+
79
+
80
+ ### Non-Think / Think Performance Comparison
81
+
82
+ | Model | GSM8k | GPT4Eval | MT Bench | Arena-Hard-v0.1 |
83
+ |---------------------------------------------|--------|--------|--------|--------|
84
+ | HyperCLOVA X SEED 14B Think **(Non-think)** | 0.9348 | 0.6741 | 8.2063 | 0.2733 |
85
+ | HyperCLOVA X SEED 14B Think **(Think)** | 0.9553 | 0.8200 | 8.8313 | 0.5826 |
86
+
87
+
88
+ ## ChatML Block
89
+
90
+ The chat template for HyperCLOVA X consists of the following elements.
91
+
92
+ - **tool_list**: A list of tools available to the model (in JSON format). If no tools are available, an empty block should be provided.
+ - **system**: System prompt. If none is available, an empty block should be provided.
+ - **user**: User input.
+ - **assistant**: Assistant output.
96
+
97
+ The basic structure of a ChatML block is as follows.
98
+
99
+ ```
100
+ <|im_start|>{tool_list/system/user/assistant}
101
+ {content}<|im_end|>
102
+ ```
103
+
104
+ - `<|im_start|>` : Start token of ChatML block
105
+ - `<|im_end|>` : End token of ChatML block
106
+
107
+ ## (ChatML) General Conversation
108
+
109
+ ### First turn
110
+
111
+ Given a two-turn conversation between the user and the assistant (`user_query_1`, `assistant_answer_1`, `user_query_2`, `assistant_answer_2`), the prompt for the first-turn can be constructed in its simplest form as follows:
112
+
113
+ ```
114
+ <|im_start|>tool_list
115
+ <|im_end|>
116
+ <|im_start|>system
117
+ - You are "CLOVA X," an AI language model developed by NAVER.
118
+ - The current date is {day of the week}, {month} {dd}, {yyyy}.<|im_end|>
119
+ <|im_start|>user
120
+ {user_query_1}<|im_end|>
121
+ <|im_start|>assistant
122
+ ```
123
+
124
+ After the `<|im_end|>` token (indicating the end of the ChatML block), the model generates text up to the `<|endofturn|>` token. This output corresponds to the assistant's first response (`assistant_answer_1`).
125
+
126
+ ### Second turn
127
+
128
+ Based on the assistant's first response (`assistant_answer_1`), when the user asks an additional question (`user_query_2`), the prompt input to the model is constructed as follows:
129
+
130
+ ```
131
+ {following the previous context}
132
+ {assistant_answer_1}<|im_end|><|endofturn|>
133
+ <|im_start|>user
134
+ {user_query_2}<|im_end|>
135
+ <|im_start|>assistant
136
+ ```
137
+
138
+ As in the previous turn, generation continues until the `<|endofturn|>` token appears after `<|im_end|>`. This corresponds to the assistant's second response (`assistant_answer_2`).
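+
+ For illustration, the two turns above can also be driven programmatically. The following is a minimal sketch, not an official API: the `chatml` and `generate` helpers and the placeholder queries are assumptions made for this example, and the recommended route is `apply_chat_template` as shown in the **Huggingface Usage Example** section below.
+
+ ```python
+ # Minimal sketch: assemble the raw ChatML prompt by hand and generate with the same
+ # stop strings used throughout this document. Helper names and queries are placeholders.
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ def chatml(role, content):
+     # One ChatML block: <|im_start|>{role}\n{content}<|im_end|>
+     return f"<|im_start|>{role}\n{content}<|im_end|>\n"
+
+ def generate(prompt):
+     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+     output_ids = model.generate(**inputs, max_new_tokens=512,
+                                 stop_strings=["<|endofturn|>", "<|stop|>"], tokenizer=tokenizer)
+     # Return only the newly generated text; special tokens are kept so the markers stay visible.
+     return tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:])
+
+ system = '- You are "CLOVA X," an AI language model developed by NAVER.\n- The current date is Monday, July 21, 2025.'
+
+ # First turn
+ prompt = chatml("tool_list", "") + chatml("system", system) + chatml("user", "{user_query_1}") + "<|im_start|>assistant\n"
+ assistant_answer_1 = generate(prompt)  # typically ends with <|im_end|><|endofturn|>
+
+ # Second turn: append the first answer and the follow-up question, then generate again
+ prompt += assistant_answer_1 + "\n" + chatml("user", "{user_query_2}") + "<|im_start|>assistant\n"
+ assistant_answer_2 = generate(prompt)
+ ```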
139
+
140
+ ## (ChatML) Function Call (Using Tools)
141
+
142
+ Insert the list of tools available into the tool_list block as a JSON list. For example, the following is a case where the only available tool is `get_weather`.
143
+
144
+ ```
145
+ <|im_start|>tool_list
146
+ [{"name": "get_weather", "description": "Check the current weather at the requested location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "Name of the city"}}, "required": ["location"]}}]<|im_end|>
147
+ ```
148
+
149
+ Additional instructions can be included in the system prompt if needed, and it is recommended to format them as `- {content}`. For example:
150
+
151
+ ```
152
+ <|im_start|>system
153
+ - In this environment, various tools can be used to answer users' questions.
154
+ - You are "CLOVA X," an AI language model developed by NAVER.
155
+ - Begin by creating a plan for solving the problem, and then utilize the tools accordingly to address the problem.
156
+ - The current date is {day of the week}, {month} {dd}, {yyyy}.
157
+ - Latest information such as news, stock prices, and shopping is retrieved through the tool_list.
158
+ - If external tools are required, the assistant should not answer directly but must first obtain the necessary information via the assistant -> tool/function_call role, and then respond.<|im_end|>
159
+ ```
160
+
161
+ ### First turn
162
+
163
+ Suppose the user gives the instruction, 'Tell me the weather in Seoul.' The prompt input to the model would then be as follows:
164
+
165
+ ```
166
+ <|im_start|>tool_list
167
+ [{"name": "get_weather", "description": "Check the current weather at the requested location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "Name of the city"}}, "required": ["location"]}}]<|im_end|>
168
+ <|im_start|>system
169
+ - In this environment, various tools can be used to answer users' questions.
170
+ - You are "CLOVA X," an AI language model developed by NAVER.
171
+ - Begin by creating a plan for solving the problem, and then utilize the tools accordingly to address the problem.
172
+ - The current date is {day of the week}, {month} {dd}, {yyyy}.
173
+ - Latest information such as news, stock prices, and shopping is retrieved through the tool_list.
174
+ - If external tools are required, the assistant should not answer directly but must first obtain the necessary information via the assistant -> tool/function_call role, and then respond.<|im_end|>
175
+ <|im_start|>user
176
+ Tell me the weather in Seoul<|im_end|>
177
+ <|im_start|>assistant
178
+ ```
179
+
180
+ Generation continues until either the `<|stop|>` or `<|endofturn|>` token appears immediately after `<|im_end|>`. An example of the model’s output is shown below. HyperCLOVA X checks the list of available tools (tool_list), selects the appropriate tool (`get_weather`), and returns the necessary information for the tool call in JSON format.
181
+
182
+ ```
183
+ {following the previous context}
184
+ Let's check the weather using the get_weather tool.<|im_end|>
185
+ <|im_start|>assistant -> tool/function_call
186
+ {"name": "get_weather","input": {"location":"Seoul"}}<|im_end|><|stop|>
187
+ ```
188
+
189
+ - `assistant -> tool/function_call` means that the assistant (model) invokes a function call.
190
+
191
+ ### Second turn
192
+
193
+ The model stopped generating because the `<|stop|>` token appeared immediately after `<|im_end|>`. Based on the generated information, `get_weather` should now be called. Calling the external function and parsing its result must be implemented separately (a minimal sketch is shown below). For this explanation, let's assume that the result of calling and parsing the external function is `{"result":{"location": "Seoul", "weather": "Sunny", "temperature": 25}}`.
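+
+ As an illustration of that glue code, here is a minimal sketch; it is not part of the model or library, and `get_weather` is a hypothetical stand-in for a real implementation.
+
+ ```python
+ # Parse the JSON emitted in the assistant -> tool/function_call block,
+ # dispatch it to a local tool, and wrap the result for the next prompt.
+ import json
+
+ def get_weather(location):
+     # Placeholder: call a real weather API here.
+     return {"location": location, "weather": "Sunny", "temperature": 25}
+
+ TOOLS = {"get_weather": get_weather}
+
+ def run_function_call(raw_block):
+     # raw_block is the text generated inside the assistant -> tool/function_call block,
+     # e.g. '{"name": "get_weather","input": {"location":"Seoul"}}'
+     call = json.loads(raw_block)
+     result = TOOLS[call["name"]](**call["input"])
+     # Wrap the result in the format used by the tool/function_call block below.
+     return json.dumps({"result": result}, ensure_ascii=False)
+
+ print(run_function_call('{"name": "get_weather","input": {"location":"Seoul"}}'))
+ # {"result": {"location": "Seoul", "weather": "Sunny", "temperature": 25}}
+ ```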
194
+
195
+ The model is now ready to respond to the second turn. Using all the information gathered so far, input the following prompt into the model and have it follow accordingly.
196
+
197
+ ```
198
+ {following the previous context}
199
+ <|im_start|>tool/function_call
200
+ {"result":{"location": "Seoul", "weather": "Sunny", "temperature": 25}}<|im_end|>
201
+ <|im_start|>assistant
202
+ ```
203
+
204
+ - `tool/function_call` means the result of the function call is passed back to the assistant (model).
205
+
206
+ Just like in the previous turn, generation continues until the `<|stop|>` or `<|endofturn|>` token appears immediately after `<|im_end|>`. If the model behaves as expected, the output will look like this.
207
+
208
+ ```
209
+ {following the previous context}
210
+ The weather in Seoul is clear and the temperature is 25 degrees.<|im_end|><|endofturn|>
211
+ ```
212
+
213
+ ## (ChatML) Inducing reasoning/non-reasoning
214
+
215
+
216
+ HyperCLOVA X can handle both reasoning and non-reasoning tasks; the difference lies in whether the assistant is prompted to 'think' before responding. Based on the previous example, to make HyperCLOVA X respond in reasoning mode, you can input the prompt into the model as follows (the tool_list and system blocks are omitted here):
217
+
218
+ ```
219
+ <|im_start|>user
220
+ Tell me the weather in Seoul<|im_end|>
221
+ <|im_start|>assistant/think
222
+
223
+ ```
224
+
225
+ - Note that the prompt ends with `assistant/think\n` (think + `\n`).
+ - Generation continues until either the `<|stop|>` or `<|endofturn|>` token appears immediately after `<|im_end|>`.
227
+
228
+ To have the assistant respond in non-reasoning mode (i.e., answer directly), you can input the following prompt.
229
+
230
+ ```
231
+ <|im_start|>user
232
+ Tell me the weather in Seoul<|im_end|>
233
+ <|im_start|>assistant
234
+
235
+ ```
236
+
237
+ - Note that the prompt ends with `assistant\n` (assistant + `\n`).
+ - Generation continues until either the `<|stop|>` or `<|endofturn|>` token appears immediately after `<|im_end|>`.
239
+
240
+
241
+ ### Adjusting inference length
242
+ The length of reasoning can be controlled by appending "\nThink for maximum {N} tokens" at the end of the user's utterance.
243
+
244
+ Example
245
+ Suppose the user says, "Tell me the prime number closest to 1000."
246
+ If you want the model to reason for approximately 1024 tokens before answering, you would construct the prompt as follows:
247
+
248
+
249
+ ```
250
+ <|im_start|>tool_list
251
+ <|im_end|>
252
+ <|im_start|>system
253
+ - You are "CLOVA X," an AI language model developed by NAVER.
254
+ - The current date is {day of the week}, {month} {dd}, {yyyy}.<|im_end|>
255
+ <|im_start|>user
256
+ Tell me the prime number closest to 1000.
257
+ Think for maximum 1024 tokens.<|im_end|>
258
+ <|im_start|>assistant/think
259
+
260
+ ```
261
+ - Adjusting the reasoning length means guiding the model to reason for approximately the specified number of tokens; the model does not always generate exactly that many tokens.
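+
+ A small sketch of how this instruction could be appended programmatically; the `with_thinking_budget` helper is an assumption for illustration, not part of the library.
+
+ ```python
+ def with_thinking_budget(user_query, max_think_tokens=1024):
+     # Append the length-control instruction described above to the user's message.
+     return f"{user_query}\nThink for maximum {max_think_tokens} tokens."
+
+ chat = [{"role": "user", "content": with_thinking_budget("Tell me the prime number closest to 1000.", 1024)}]
+ # Pass `chat` to tokenizer.apply_chat_template(..., force_reasoning=True, ...) as in the
+ # Hugging Face examples below; the budget guides, rather than hard-limits, the reasoning length.
+ ```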
262
+
263
+
264
+ ### Multiturn inference
265
+
266
+ When engaging in multi-turn conversations with reasoning enabled, the reasoning content from previous turns (`<|im_start|>assistant/think\n...<|im_end|>`) is not included in the next turn's prompt. Instead, as in the standard multi-turn dialogue format, only the final assistant response is included in the prompt passed to the model.
267
+
268
+ The same applies when a function call is made during reasoning: the reasoning content is excluded, and only the function call block (`<|im_start|>assistant -> tool/function_call\n...<|im_end|><|stop|>`) is included in the prompt provided to the model.
269
+
270
+ Example: Solving a simple math problem
271
+ Suppose the user sends a request: "Calculate the sum of odd numbers from 1 to 10."
272
+ The following explains step-by-step how this is processed as a multi-turn reasoning interaction.
273
+
274
+ 1. First Turn
275
+ Input Prompt
276
+ ```
277
+ <|im_start|>tool_list
278
+ <|im_end|>
279
+ <|im_start|>system
280
+ <|im_end|>
281
+ <|im_start|>user
282
+ Calculate the sum of odd numbers from 1 to 10
283
+ <|im_end|>
284
+ <|im_start|>assistant/think
285
+
286
+ ```
287
+ The model then begins to generate its reasoning content.
288
+ ```
289
+ {following the previous context}
290
+ The user wants me to find the sum of the odd numbers from 1 to 10. First, I need to clearly understand the problem. I should identify which numbers between 1 and 10 are odd. Going through the numbers in order, the odd numbers are 1, 3, 5, 7, and 9. Since 10 is even, it is excluded. Now, I need to add these numbers together. Adding them step by step: 1+3=4, plus 5 gives 9, plus 7 gives 16, and finally adding 9 gives 25. But is there another way to calculate this? For example, I could use the formula for the sum of an arithmetic sequence. The odd numbers here form an arithmetic sequence with the first term 1 and a common difference of 2. I need to count the number of terms first—there are 5 odd numbers between 1 and 10. The formula for the sum of an arithmetic sequence is n*(a₁ + aₙ)/2, where n is 5, a₁ is 1, and aₙ is 9. Plugging these in, we get 5*(1 + 9)/2 = 5*10/2 = 25, which matches the previous result. Since both methods give 25, I’ll double-check that this is the answer the user expects, and then provide a concise explanation.<|im_end|>
291
+ <|im_start|>assistant
292
+ The odd numbers from 1 to 10 are **1, 3, 5, 7, 9**.
293
+ The sum of these numbers is calculated as follows:
294
+
295
+ \[
296
+ 1 + 3 + 5 + 7 + 9 = 25
297
+ \]
298
+
299
+ Alternatively, the sum can be easily calculated using **the formula for the sum of an arithmetic sequence**:
300
+ - First term (\(a_1\)) = 1
301
+ - Last term (\(a_n\)) = 9
302
+ - Number of terms (\(n\)) = 5
303
+
304
+ \[
305
+ \text{Sum} = \frac{n(a_1 + a_n)}{2} = \frac{5 \times (1 + 9)}{2} = \frac{5 \times 10}{2} = 25
306
+ \]
307
+
308
+ **Result:** The answer is **25**.<|im_end|><|endofturn|>
309
+ ```
310
+ As the `<|endofturn|>` token has been generated, the model concludes the current turn and awaits the next user input.
311
+
312
+ 2. Second Turn
313
+ Suppose you want to follow up with: "What is the result of adding 10 to that sum?" Here, the prompt is created by excluding the reasoning content (assistant/think) from the first turn and including only the final response (assistant).
314
+
315
+ - Input Prompt
316
+
317
+ ```
318
+ <|im_start|>tool_list
319
+ <|im_end|>
320
+ <|im_start|>system
321
+ <|im_end|>
322
+ <|im_start|>user
323
+ Calculate the sum of odd numbers from 1 to 10
324
+ <|im_end|>
325
+ <|im_start|>assistant
326
+ The odd numbers from 1 to 10 are **1, 3, 5, 7, 9**.
327
+ The sum of these numbers is calculated as follows:
328
+
329
+ \[
330
+ 1 + 3 + 5 + 7 + 9 = 25
331
+ \]
332
+
333
+ Alternatively, the sum can be easily calculated using **the formula for the sum of an arithmetic sequence**:
334
+ - First term (\(a_1\)) = 1
335
+ - Last term (\(a_n\)) = 9
336
+ - Number of terms (\(n\)) = 5
337
+
338
+ \[
339
+ \text{Sum} = \frac{n(a_1 + a_n)}{2} = \frac{5 \times (1 + 9)}{2} = \frac{5 \times 10}{2} = 25
340
+ \]
341
+
342
+ **Result:** The answer is **25**.<|im_end|><|endofturn|>
344
+ <|im_start|>user
345
+ What is the result of adding 10 to that sum?
346
+ <|im_end|>
347
+ <|im_start|>assistant/think
348
+
349
+ ```
350
+
351
+ - Excluded contents in the second turn input
352
+ ```
353
+ <|im_start|>assistant/think
354
+ {following the previous context}
355
+ The user wants me to find the sum of the odd numbers from 1 to 10. First, I need to clearly understand the problem. I should identify which numbers between 1 and 10 are odd. Going through the numbers in order, the odd numbers are 1, 3, 5, 7, and 9. Since 10 is even, it is excluded. Now, I need to add these numbers together. Adding them step by step: 1+3=4, plus 5 gives 9, plus 7 gives 16, and finally adding 9 gives 25. But is there another way to calculate this? For example, I could use the formula for the sum of an arithmetic sequence. The odd numbers here form an arithmetic sequence with the first term 1 and a common difference of 2. I need to count the number of terms first—there are 5 odd numbers between 1 and 10. The formula for the sum of an arithmetic sequence is n*(a₁ + aₙ)/2, where n is 5, a₁ is 1, and aₙ is 9. Plugging these in, we get 5*(1 + 9)/2 = 5*10/2 = 25, which matches the previous result. Since both methods give 25, I’ll double-check that this is the answer the user expects, and then provide a concise explanation.<|im_end|>
356
+ ```
357
+
358
+ For all later turns, the reasoning (think) content from previous turns is likewise excluded from the prompt.
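+
+ A minimal sketch of this filtering step, assuming the raw history is kept as a list of (role, content) pairs whose role names mirror the ChatML blocks above; the helper itself is illustrative, not an official API.
+
+ ```python
+ def prompt_history(history):
+     # Drop previous-turn reasoning blocks; keep user turns and final assistant answers only.
+     return [(role, content) for role, content in history if role != "assistant/think"]
+
+ history = [
+     ("user", "Calculate the sum of odd numbers from 1 to 10"),
+     ("assistant/think", "...first-turn reasoning..."),
+     ("assistant", "**Result:** The answer is **25**."),
+     ("user", "What is the result of adding 10 to that sum?"),
+ ]
+ print(prompt_history(history))  # the assistant/think entry is gone
+ ```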
359
+
360
+ ### Difference between `<|stop|>` and `<|endofturn|>`
361
+
362
+ - **Similarity**
+   - They both act as signals to stop the model from generating further responses.
+ - **Difference**
+   - `<|stop|>`: Response generation is halted; after the tool call results are processed, the AI turn resumes.
+   - `<|endofturn|>`: Response generation is halted, the AI turn is fully terminated, and the model waits for the user's next input.
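+
+ A minimal sketch of an agent-style turn loop built on this distinction; `run_turn` and the fake generator below are illustrative assumptions, not an official API.
+
+ ```python
+ def run_turn(prompt, generate, call_tool):
+     while True:
+         output = generate(prompt)             # generates until <|stop|> or <|endofturn|>
+         prompt += output
+         if output.endswith("<|endofturn|>"):
+             return prompt                     # AI turn fully finished; wait for the next user input
+         # The model stopped with <|stop|> after an assistant -> tool/function_call block:
+         tool_result = call_tool(output)       # run the requested tool (see the function-call sketch earlier)
+         prompt += "\n<|im_start|>tool/function_call\n" + tool_result + "<|im_end|>\n<|im_start|>assistant\n"
+
+ # Toy demo with a fake generator that "calls" a tool once and then finishes the turn.
+ fake_outputs = iter([
+     "Let's check the weather using the get_weather tool.<|im_end|>\n"
+     "<|im_start|>assistant -> tool/function_call\n"
+     '{"name": "get_weather","input": {"location":"Seoul"}}<|im_end|><|stop|>',
+     "The weather in Seoul is clear and the temperature is 25 degrees.<|im_end|><|endofturn|>",
+ ])
+ print(run_turn("<|im_start|>assistant\n",
+                generate=lambda p: next(fake_outputs),
+                call_tool=lambda o: '{"result":{"location": "Seoul", "weather": "Sunny", "temperature": 25}}'))
+ ```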
368
+
369
+
370
+ ## **Huggingface Usage Example**
371
+
372
+ After downloading the model binaries, including the configuration files, to a local path (`/path/to/hyperclova-x-seed-think-14b`), you can run the following in a Python environment with the [Huggingface library](https://huggingface.co/docs/transformers/installation) (verified to work with version 4.45.0) and [timm (pytorch-image-models)](https://github.com/huggingface/pytorch-image-models) installed.
373
+
374
+ You can use the parameters of `apply_chat_template` to explicitly enable or disable the reasoning feature.
375
+
376
+ - The default value for both options is `None`, in which case the model decides on its own whether to reason before answering or to answer directly without reasoning.
377
+ - `force_reasoning=True`: Forces the model to always reason before answering.
378
+ - `skip_reasoning=True`: Forces the model to answer directly without reasoning.
379
+ - Passing `None` or `False` has the same effect.
380
+ - If both are set to True, `force_reasoning=True` takes precedence.
381
+
382
+ ```python
383
+ # Default example
384
+ inputs = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_dict=True, return_tensors="pt")
385
+
386
+ # By adding force_reasoning=True, the model is forced to always reason before responding
387
+ inputs = tokenizer.apply_chat_template(chat, add_generation_prompt=True, force_reasoning=True, return_dict=True, return_tensors="pt")
388
+
389
+ # By adding skip_reasoning=True, the model is forced to always answer directly without reasoning
390
+ inputs = tokenizer.apply_chat_template(chat, add_generation_prompt=True, skip_reasoning=True, return_dict=True, return_tensors="pt")
391
+ ```
392
+
393
+ ### Non-think Example Code
394
+ ```python
395
+ from transformers import AutoModelForCausalLM, AutoTokenizer
396
+
397
+ model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
398
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
399
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
400
+
401
+ chat = [
402
+ {"role": "system", "content": "- In this environment, various tools can be used to answer users' questions.\n- You are \"CLOVA X,\" an AI language model developed by NAVER.\n- Begin by creating a plan for solving the problem, and then utilize the tools accordingly to address the problem.\n- The current date is Monday, July 21, 2025.\n- Latest information such as news, stock prices, and shopping is retrieved through the tool_list.\n- If external tools are required, the assistant should not answer directly but must first obtain the necessary information via the assistant -> tool/function_call role, and then respond."},
403
+ {"role": "user", "content": "Explain in as much detail as possible the relationship between the Schrödinger equation and quantum mechanics."},
404
+ ]
405
+
406
+ # By adding skip_reasoning=True, the model is forced to always answer directly without reasoning
407
+ inputs = tokenizer.apply_chat_template(chat, add_generation_prompt=True, skip_reasoning=True, return_dict=True, return_tensors="pt")
408
+ inputs = inputs.to("cuda")
409
+
410
+ output_ids = model.generate(
411
+ **inputs,
412
+ max_length=1024,
413
+ stop_strings=["<|endofturn|>", "<|stop|>"],
414
+ temperature=0.5,
415
+ top_p=0.6,
416
+ repetition_penalty=1.05,
417
+ tokenizer=tokenizer
418
+ )
419
+ print(tokenizer.batch_decode(output_ids))
420
+ ```
421
+
422
+ ### Think Example Code
423
+ ```python
424
+ from transformers import AutoModelForCausalLM, AutoTokenizer
425
+
426
+ model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
427
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
428
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
429
+
430
+ chat = [
431
+ {"role": "system", "content": "- In this environment, various tools can be used to answer users' questions.\n- You are \"CLOVA X,\" an AI language model developed by NAVER.\n- Begin by creating a plan for solving the problem, and then utilize the tools accordingly to address the problem.\n- The current date is Monday, July 21, 2025.\n- Latest information such as news, stock prices, and shopping is retrieved through the tool_list.\n- If external tools are required, the assistant should not answer directly but must first obtain the necessary information via the assistant -> tool/function_call role, and then respond."},
432
+ {"role": "user", "content": "Explain in as much detail as possible the relationship between the Schrödinger equation and quantum mechanics."},
433
+ ]
434
+
435
+ # By adding force_reasoning=True, the model is forced to always reason before responding
436
+ inputs = tokenizer.apply_chat_template(chat, add_generation_prompt=True, force_reasoning=True, return_dict=True, return_tensors="pt")
437
+ inputs = inputs.to("cuda")
438
+
439
+ output_ids = model.generate(
440
+ **inputs,
441
+ max_length=1024,
442
+ stop_strings=["<|endofturn|>", "<|stop|>"],
443
+ temperature=0.5,
444
+ top_p=0.6,
445
+ repetition_penalty=1.05,
446
+ tokenizer=tokenizer
447
+ )
448
+ print(tokenizer.batch_decode(output_ids))
449
+ ```
450
+
451
+ ### Hybrid (the model decides whether to use think or non-think mode) Example Code
452
+ ```python
453
+ from transformers import AutoModelForCausalLM, AutoTokenizer
454
+
455
+ model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
456
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
457
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
458
+
459
+ chat = [
460
+ {"role": "system", "content": "- In this environment, various tools can be used to answer users' questions.\n- You are \"CLOVA X,\" an AI language model developed by NAVER.\n- Begin by creating a plan for solving the problem, and then utilize the tools accordingly to address the problem.\n- The current date is Monday, July 21, 2025.\n- Latest information such as news, stock prices, and shopping is retrieved through the tool_list.\n- If external tools are required, the assistant should not answer directly but must first obtain the necessary information via the assistant -> tool/function_call role, and then respond."},
461
+ {"role": "user", "content": "Explain in as much detail as possible the relationship between the Schrödinger equation and quantum mechanics."},
462
+ ]
463
+
464
+ # The model decides whether to answer after reasoning or to respond immediately without reasoning
465
+ inputs = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_dict=True, return_tensors="pt")
466
+ inputs = inputs.to("cuda")
467
+
468
+ output_ids = model.generate(
469
+ **inputs,
470
+ max_length=1024,
471
+ stop_strings=["<|endofturn|>", "<|stop|>"],
472
+ temperature=0.5,
473
+ top_p=0.6,
474
+ repetition_penalty=1.05,
475
+ tokenizer=tokenizer
476
+ )
477
+ print(tokenizer.batch_decode(output_ids))
478
+ ```
479
+
480
+ ### Example code for function calls (tool usage)
481
+ For a scenario involving tool usage, you can execute it as follows.
482
+
483
+ ```python
484
+ import json
485
+ from transformers import AutoModelForCausalLM, AutoTokenizer
486
+
487
+ model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
488
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
489
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
490
+
491
+ # 1) The name of the tool should be written as function_call.{{ name }}.
492
+ # 2) Parameters follow the specifications described at https://platform.openai.com/docs/guides/function-calling?api-mode=responses.
493
+ tool_list = [
494
+ {
495
+ "type": "function",
496
+ "function": {
497
+ "name": "add",
498
+ "description": "Add two numbers.",
499
+ "parameters": {
500
+ "type": "object",
501
+ "properties": {
502
+ "x": {"type": "number", "description": "First number"},
503
+ "y": {"type": "number", "description": "Second number"}
504
+ },
505
+ "required": ["x", "y"]
506
+ }
507
+ }
508
+ }, {
509
+ "type": "function",
510
+ "function": {
511
+ "name": "subtract",
512
+ "description": "Subtract two numbers.",
513
+ "parameters": {
514
+ "type": "object",
515
+ "properties": {
516
+ "x": {"type": "number", "description": "First number"},
517
+ "y": {"type": "number", "description": "Second number"}
518
+ },
519
+ "required": ["x", "y"]
520
+ }
521
+ }
522
+ }
523
+ ]
524
+
525
+
526
+ chat = [
527
+ {"role": "system", "content": "- In this environment, various tools can be used to answer users' questions.\n- You are \"CLOVA X,\" an AI language model developed by NAVER.\n- Begin by creating a plan for solving the problem, and then utilize the tools accordingly to address the problem.\n- The current date is Monday, July 21, 2025.\n- Latest information such as news, stock prices, and shopping is retrieved through the tool_list.\n- If external tools are required, the assistant should not answer directly but must first obtain the necessary information via the assistant -> tool/function_call role, and then respond."},
528
+ {"role": "user", "content": "What is 1588 + 1234? Please calculate it using the provided tool."},
529
+ ]
530
+
531
+ inputs = tokenizer.apply_chat_template(chat, tools=tool_list, add_generation_prompt=True, return_dict=True, return_tensors="pt")
532
+ inputs = inputs.to("cuda")
533
+
534
+ output_ids = model.generate(
535
+ **inputs,
536
+ max_length=1024,
537
+ stop_strings=["<|endofturn|>", "<|stop|>"],
538
+ temperature=0.5,
539
+ top_p=0.6,
540
+ repetition_penalty=1.05,
541
+ tokenizer=tokenizer
542
+ )
543
+ print(tokenizer.batch_decode(output_ids))
544
+ ```
545
+
546
+ - If you have any questions or issues regarding usage, please leave them as an issue in the Discussions section of this page.
547
+
548
+ ## **vLLM Usage Example**
549
+
550
+ The HyperCLOVA X SEED Think model is built on a custom LLM architecture based on the LLaMA architecture, incorporating μP and Peri-LN techniques. For convenient use with vLLM, a dedicated vLLM plugin is provided; once vLLM is set up, the plugin can be installed and used with ease.
551
+
552
+ 1. Download vLLM plugin source code
553
+
554
+ ```bash
555
+ git clone https://github.com/NAVER-Cloud-HyperCLOVA-X/hcx-vllm-plugin
556
+ ```
557
+
558
+ 2. vLLM plugin build & installation: from inside the NAVER-Cloud-HyperCLOVA-X/hcx-vllm-plugin directory downloaded in step 1, run the command below.
559
+
560
+ ```bash
561
+ pip install -e .
562
+ ```
563
+
564
+ After downloading the model checkpoint to a local path (`/path/to/hyperclova-x-seed-think-14b`), you can perform text inference by running the following commands on a GPU environment with A100 or higher.
565
+
566
+ ```bash
567
+ python -m vllm.entrypoints.openai.api_server --model=/path/to/hyperclova-x-seed-think-14b --trust_remote_code --port=8000
568
+
569
+ curl http://localhost:8000/v1/completions \
570
+ -H "Content-Type: application/json" \
571
+ -d '{
572
+ "prompt": "<|im_start|>tool_list\n<|im_end|>\n<|im_start|>system\n- The AI language model is named \"CLOVA X\" and was developed by NAVER.\n- Today is Friday, July 18, 2025.<|im_end|>\n<|im_start|>user\nExplain in as much detail as possible the relationship between the Schrödinger equation and quantum mechanics.<|im_end|>\n<|im_start|>assistant/think\n",
573
+ "top_k":-1,
574
+ "temperature":0.5,
575
+ "top_p":0.6,
576
+ "repetition_penalty":1.05,
577
+ "stop":["<|im_end|><|endofturn|>", "<|im_end|><|stop|>"],
578
+ "max_tokens":8192,
579
+ "skip_special_tokens":false
580
+ }'
581
+ ```
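+
+ The same request can also be sent from Python with the `requests` package (assumed to be installed); this simply mirrors the curl example above.
+
+ ```python
+ import requests
+
+ payload = {
+     "prompt": (
+         "<|im_start|>tool_list\n<|im_end|>\n"
+         "<|im_start|>system\n"
+         "- The AI language model is named \"CLOVA X\" and was developed by NAVER.\n"
+         "- Today is Friday, July 18, 2025.<|im_end|>\n"
+         "<|im_start|>user\n"
+         "Explain in as much detail as possible the relationship between the Schrödinger equation and quantum mechanics.<|im_end|>\n"
+         "<|im_start|>assistant/think\n"
+     ),
+     "top_k": -1,
+     "temperature": 0.5,
+     "top_p": 0.6,
+     "repetition_penalty": 1.05,
+     "stop": ["<|im_end|><|endofturn|>", "<|im_end|><|stop|>"],
+     "max_tokens": 8192,
+     "skip_special_tokens": False,
+ }
+ response = requests.post("http://localhost:8000/v1/completions", json=payload)
+ print(response.json()["choices"][0]["text"])
+ ```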
582
+
583
+ ## License
584
+
585
+ The model is licensed under the [HyperCLOVA X SEED 14B Think Model License Agreement](./LICENSE).
586
+
587
+ ## Citation
588
+
589
+ ```
590
+ @misc{navercloudhyperclovaxteam2025hyperclovaxthinktechnical,
591
+ title={HyperCLOVA X THINK Technical Report},
592
+ author={NAVER Cloud HyperCLOVA X Team},
593
+ year={2025},
594
+ eprint={2506.22403},
595
+ archivePrefix={arXiv},
596
+ primaryClass={cs.CL},
597
+ url={https://arxiv.org/abs/2506.22403},
598
+ }
599
+ ```
600
+
601
+ ## Questions
602
+
603
+ For any other questions, please feel free to contact us at [[email protected]](mailto:[email protected]).
added_tokens.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "<EMAIL>": 110521,
3
+ "<KEY>": 110522,
4
+ "<NAME>": 110520,
5
+ "<PASSWORD>": 110523,
6
+ "<code_to_intermediate>": 110502,
7
+ "<empty_output>": 110501,
8
+ "<file_sep>": 110492,
9
+ "<intermediate_to_code>": 110503,
10
+ "<issue_closed>": 110495,
11
+ "<issue_comment>": 110494,
12
+ "<issue_start>": 110493,
13
+ "<jupyter_code>": 110498,
14
+ "<jupyter_output>": 110499,
15
+ "<jupyter_script>": 110500,
16
+ "<jupyter_start>": 110496,
17
+ "<jupyter_text>": 110497,
18
+ "<pr>": 110504,
19
+ "<pr_base>": 110507,
20
+ "<pr_base_code>": 110509,
21
+ "<pr_comment>": 110512,
22
+ "<pr_diff>": 110510,
23
+ "<pr_diff_hunk>": 110511,
24
+ "<pr_diff_hunk_comment_line>": 110519,
25
+ "<pr_event_id>": 110513,
26
+ "<pr_file>": 110508,
27
+ "<pr_in_reply_to_comment_id>": 110518,
28
+ "<pr_in_reply_to_review_id>": 110517,
29
+ "<pr_is_merged>": 110506,
30
+ "<pr_review>": 110514,
31
+ "<pr_review_comment>": 110516,
32
+ "<pr_review_state>": 110515,
33
+ "<pr_status>": 110505,
34
+ "<repo_name>": 110491
35
+ }
config.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "architectures": [
3
+ "HyperCLOVAXForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "attention_multiplier": 0.0078125,
8
+ "attn_pdrop": 0.0,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
11
+ "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
12
+ "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM"
13
+ },
14
+ "bos_token_id": 100257,
15
+ "embd_pdrop": 0.0,
16
+ "embedding_multiplier": 10.0,
17
+ "end_token_id": 100257,
18
+ "eos_token_id": 100257,
19
+ "head_dim": 128,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 6144,
22
+ "initializer_range": 0.012727922061357854,
23
+ "intermediate_size": 14336,
24
+ "logits_scaling": 0.125,
25
+ "max_position_embeddings": 131072,
26
+ "mlp_bias": false,
27
+ "model_type": "hyperclovax",
28
+ "num_attention_heads": 48,
29
+ "num_hidden_layers": 38,
30
+ "num_key_value_heads": 8,
31
+ "pad_token_id": 100257,
32
+ "pretraining_tp": 1,
33
+ "resid_pdrop": 0.0,
34
+ "residual_multiplier": 1.0,
35
+ "rms_norm_eps": 1e-05,
36
+ "rope_scaling": null,
37
+ "rope_theta": 100000000,
38
+ "summary_first_dropout": 0.0,
39
+ "tie_word_embeddings": false,
40
+ "torch_dtype": "float32",
41
+ "transformers_version": "4.52.4",
42
+ "use_cache": false,
43
+ "use_post_norm": true,
44
+ "vocab_size": 110592
45
+ }
configuration_hyperclovax.py ADDED
@@ -0,0 +1,235 @@
1
+ # coding=utf-8
2
+ # This file was created for the HyperCLOVA X SEED 14B Think architecture.
3
+ # partially copied and modified from https://github.com/huggingface/transformers
4
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
5
+ #
6
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
7
+ # and OPT implementations in this library. It has been modified from its
8
+ # original forms to accommodate minor architectural differences compared
9
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+ """HyperCLOVAX model configuration"""
23
+
24
+ from transformers.configuration_utils import PretrainedConfig
25
+
26
+ class HyperCLOVAXConfig(PretrainedConfig):
27
+ r"""
28
+ This is the configuration class to store the configuration of a [`HyperCLOVAXModel`]. It is used to instantiate a HyperCLOVAX
29
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
30
+ defaults will yield a similar configuration to that of the HyperCLOVAX.
31
+
32
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
+ documentation from [`PretrainedConfig`] for more information.
34
+
35
+
36
+ Args:
37
+ vocab_size (`int`, *optional*, defaults to 32000):
38
+ Vocabulary size of the HyperCLOVAX model. Defines the number of different tokens that can be represented by the
39
+ `input_ids` passed when calling [`HyperCLOVAXModel`]
40
+ hidden_size (`int`, *optional*, defaults to 4096):
41
+ Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 11008):
43
+ Dimension of the MLP representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
+ Number of hidden layers in the Transformer decoder.
46
+ num_attention_heads (`int`, *optional*, defaults to 32):
47
+ Number of attention heads for each attention layer in the Transformer decoder.
48
+ num_key_value_heads (`int`, *optional*):
49
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
50
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
51
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
52
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
53
+ by mean-pooling all the original heads within that group. For more details check out [this
54
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
55
+ `num_attention_heads`.
56
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
57
+ The non-linear activation function (function or string) in the decoder.
58
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
59
+ The maximum sequence length that this model might ever be used with.
60
+ initializer_range (`float`, *optional*, defaults to 0.02):
61
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
62
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
63
+ The epsilon used by the rms normalization layers.
64
+ use_cache (`bool`, *optional*, defaults to `True`):
65
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
66
+ relevant if `config.is_decoder=True`.
67
+ pad_token_id (`int`, *optional*):
68
+ Padding token id.
69
+ bos_token_id (`int`, *optional*, defaults to 1):
70
+ Beginning of stream token id.
71
+ eos_token_id (`int`, *optional*, defaults to 2):
72
+ End of stream token id.
73
+ pretraining_tp (`int`, *optional*, defaults to 1):
74
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
75
+ document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
76
+ understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
77
+ results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
78
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
79
+ Whether to tie weight embeddings
80
+ rope_theta (`float`, *optional*, defaults to 10000.0):
81
+ The base period of the RoPE embeddings.
82
+ rope_scaling (`Dict`, *optional*):
83
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
84
+ and you expect the model to work on longer `max_position_embeddings`, we recommend updating this value
85
+ accordingly.
86
+ Expected contents:
87
+ `rope_type` (`str`):
88
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
89
+ 'llama3'], with 'default' being the original RoPE implementation.
90
+ `factor` (`float`, *optional*):
91
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
92
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
93
+ original maximum pre-trained length.
94
+ `original_max_position_embeddings` (`int`, *optional*):
95
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
96
+ pretraining.
97
+ `attention_factor` (`float`, *optional*):
98
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
99
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
100
+ `factor` field to infer the suggested value.
101
+ `beta_fast` (`float`, *optional*):
102
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
103
+ ramp function. If unspecified, it defaults to 32.
104
+ `beta_slow` (`float`, *optional*):
105
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
106
+ ramp function. If unspecified, it defaults to 1.
107
+ `short_factor` (`List[float]`, *optional*):
108
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
109
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
110
+ size divided by the number of attention heads divided by 2
111
+ `long_factor` (`List[float]`, *optional*):
112
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
113
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
114
+ size divided by the number of attention heads divided by 2
115
+ `low_freq_factor` (`float`, *optional*):
116
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
117
+ `high_freq_factor` (`float`, *optional*):
118
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
119
+ attention_bias (`bool`, *optional*, defaults to `False`):
120
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
121
+ attention_dropout (`float`, *optional*, defaults to 0.0):
122
+ The dropout ratio for the attention probabilities.
123
+ mlp_bias (`bool`, *optional*, defaults to `False`):
124
+ Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
125
+ head_dim (`int`, *optional*):
126
+ The attention head dimension. If None, it will default to hidden_size // num_heads
127
+ embedding_multiplier (`float`, *optional*, defaults to `None`):
128
+ Multiplier applied to the embedding weights. If `None`, it is equivalent to `1.0`.
129
+ logits_scaling (`float`, *optional*, defaults to `None`):
130
+ Scaling factor for logits. If `None`, it is equivalent to `1.0`.
131
+ attention_multiplier (`float`, *optional*, defaults to `None`):
132
+ Multiplier applied to the attention weights. If `None`, it is equivalent to `self.head_dim ** -0.5`.
133
+ residual_multiplier (`float`, *optional*, defaults to `None`):
134
+ Scaling factor for residual connections. If `None`, it is equivalent to `1.0`.
135
+ use_post_norm (`bool`, *optional*, defaults to `False`):
136
+ Whether to apply Peri-Layer Normalization (Peri-LN), which adds post-normalization after the attention and MLP sub-layers.
137
+
138
+ ```python
139
+ >>> from transformers import HyperCLOVAXModel, HyperCLOVAXConfig
140
+
141
+ >>> # Initializing a HyperCLOVAX style configuration
142
+ >>> configuration = HyperCLOVAXConfig()
143
+
144
+ >>> # Initializing a model from the HyperCLOVAX style configuration
145
+ >>> model = HyperCLOVAXModel(configuration)
146
+
147
+ >>> # Accessing the model configuration
148
+ >>> configuration = model.config
149
+ ```"""
150
+
151
+ model_type = "hyperclovax"
152
+ keys_to_ignore_at_inference = ["past_key_values"]
153
+
154
+ def __init__(
155
+ self,
156
+ vocab_size=32000,
157
+ hidden_size=4096,
158
+ intermediate_size=11008,
159
+ num_hidden_layers=32,
160
+ num_attention_heads=32,
161
+ num_key_value_heads=None,
162
+ hidden_act="silu",
163
+ max_position_embeddings=2048,
164
+ initializer_range=0.02,
165
+ rms_norm_eps=1e-6,
166
+ use_cache=True,
167
+ pad_token_id=None,
168
+ bos_token_id=1,
169
+ eos_token_id=2,
170
+ pretraining_tp=1,
171
+ tie_word_embeddings=False,
172
+ rope_theta=10000.0,
173
+ rope_scaling=None,
174
+ attention_bias=False,
175
+ attention_dropout=0.0,
176
+ mlp_bias=False,
177
+ head_dim=None,
178
+ embedding_multiplier=None, # MuP
179
+ logits_scaling=None, # MuP
180
+ attention_multiplier=None, # MuP
181
+ residual_multiplier=None, # MuP
182
+ use_post_norm=False, # Peri-LN (post-norm)
183
+ auto_map={
184
+ "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
185
+ "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
186
+ "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM"
187
+ },
188
+ **kwargs,
189
+ ):
190
+ self.vocab_size = vocab_size
191
+ self.max_position_embeddings = max_position_embeddings
192
+ self.hidden_size = hidden_size
193
+ self.intermediate_size = intermediate_size
194
+ self.num_hidden_layers = num_hidden_layers
195
+ self.num_attention_heads = num_attention_heads
196
+
197
+ # for backward compatibility
198
+ if num_key_value_heads is None:
199
+ num_key_value_heads = num_attention_heads
200
+
201
+ self.num_key_value_heads = num_key_value_heads
202
+ self.hidden_act = hidden_act
203
+ self.initializer_range = initializer_range
204
+ self.rms_norm_eps = rms_norm_eps
205
+ self.pretraining_tp = pretraining_tp
206
+ self.use_cache = use_cache
207
+ self.rope_theta = rope_theta
208
+ self.rope_scaling = rope_scaling
209
+ self.attention_bias = attention_bias
210
+ self.attention_dropout = attention_dropout
211
+ self.mlp_bias = mlp_bias
212
+ self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
213
+ # Validate the correctness of rotary position embeddings parameters
214
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
215
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
216
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
217
+ # rope_config_validation(self)
218
+
219
+ # MuP
220
+ self.embedding_multiplier = embedding_multiplier if embedding_multiplier is not None else 1.0
221
+ self.logits_scaling = logits_scaling if logits_scaling is not None else 1.0
222
+ self.attention_multiplier = attention_multiplier if attention_multiplier is not None else self.head_dim ** -0.5
223
+ self.residual_multiplier = residual_multiplier if residual_multiplier is not None else 1.0
224
+
225
+ # Peri-LN (post-norm)
226
+ self.use_post_norm = use_post_norm
227
+
228
+ super().__init__(
229
+ pad_token_id=pad_token_id,
230
+ bos_token_id=bos_token_id,
231
+ eos_token_id=eos_token_id,
232
+ tie_word_embeddings=tie_word_embeddings,
233
+ auto_map=auto_map,
234
+ **kwargs,
235
+ )
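
For reference, when the muP multipliers are left at `None`, `__init__` resolves them itself: `attention_multiplier` falls back to `head_dim ** -0.5` and the other multipliers fall back to `1.0`. A minimal sketch, assuming it is run from a local checkout of this repository so that `configuration_hyperclovax.py` is importable:

```python
# Minimal sketch; run from the repository root so the module below is importable.
from configuration_hyperclovax import HyperCLOVAXConfig

cfg = HyperCLOVAXConfig()  # muP multipliers left at their default of None

# With the docstring defaults (hidden_size=4096, num_attention_heads=32), head_dim is 128,
# so attention_multiplier resolves to 128 ** -0.5 and the remaining multipliers to 1.0.
assert cfg.head_dim == 4096 // 32 == 128
assert cfg.attention_multiplier == cfg.head_dim ** -0.5
assert cfg.embedding_multiplier == 1.0
assert cfg.logits_scaling == 1.0
assert cfg.residual_multiplier == 1.0
```

Note that the released `config.json` overrides these defaults (e.g. `embedding_multiplier` 10.0, `logits_scaling` 0.125, `attention_multiplier` 1/128).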
generation_config.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100257,
4
+ "eos_token_id": 100257,
5
+ "pad_token_id": 100257,
6
+ "transformers_version": "4.52.4",
7
+ "use_cache": false
8
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:305362a08287f1a07c8901b17569030003342501439732b35372f141488e624d
3
+ size 4831938472
model-00002-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff239283211a4d509ae03a47d26eb81981a1e6629b8593e7fea6f7bc982cc7ee
3
+ size 4932899128
model-00003-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a87408267364edb5e0ad365c01d33272ecd767eef3a59dcae20b5c4e2b80d2d9
3
+ size 4932800744
model-00004-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34be38a9cd94474e2145567c0d8d8e772e38811a6b7cc7ff21ad32b78cd82e28
3
+ size 4932899160
model-00005-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b93ef8634259a055d34eda87bc7f0c07493be1729979da93c7fd0b63b4cfc8
3
+ size 4932800784
model-00006-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0166c99db8788a872fe9677d88095116586807e61466caeffb1b317815025485
3
+ size 4932899168
model-00007-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3091a6c988544d07a79a12ac4e6327d0b80802af7463f657fa55706760a7b16
3
+ size 4932800784
model-00008-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:437804c0b8b0ac88e24253304a7c0d27c0e94ee54fdba2d6b6474f0fcfd71f77
3
+ size 4932899168
model-00009-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bae7aa1fb3ec4ac0d8a9046a15b8674d80b1165ec423b66c3429416846bd076
3
+ size 4932800784
model-00010-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ec60ba7881c3afa594a584bde88d806582eab0f7768a70549d377b06544c27
3
+ size 4932899168
model-00011-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c2e182da134b1702e84be4f6bff0157182bb8870963b470da45acc4fd9102f5
3
+ size 4932800784
model-00012-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ced46cb6d54cc2ace518803b1d350211b110560a7d841412b61d22b09f55e
3
+ size 4832061536
model.safetensors.index.json ADDED
@@ -0,0 +1,428 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 58992451584
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00012-of-00012.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00012.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00012.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00012.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00012.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00012.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00012.safetensors",
13
+ "model.layers.0.post_norm1.weight": "model-00001-of-00012.safetensors",
14
+ "model.layers.0.post_norm2.weight": "model-00001-of-00012.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00012.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00012.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00012.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00012.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00012.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00012.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00012.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00002-of-00012.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00012.safetensors",
24
+ "model.layers.1.post_norm1.weight": "model-00002-of-00012.safetensors",
25
+ "model.layers.1.post_norm2.weight": "model-00002-of-00012.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00012.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00012.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00012.safetensors",
29
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00012.safetensors",
30
+ "model.layers.10.input_layernorm.weight": "model-00004-of-00012.safetensors",
31
+ "model.layers.10.mlp.down_proj.weight": "model-00004-of-00012.safetensors",
32
+ "model.layers.10.mlp.gate_proj.weight": "model-00004-of-00012.safetensors",
33
+ "model.layers.10.mlp.up_proj.weight": "model-00004-of-00012.safetensors",
34
+ "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00012.safetensors",
35
+ "model.layers.10.post_norm1.weight": "model-00004-of-00012.safetensors",
36
+ "model.layers.10.post_norm2.weight": "model-00004-of-00012.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00004-of-00012.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00004-of-00012.safetensors",
39
+ "model.layers.10.self_attn.q_proj.weight": "model-00004-of-00012.safetensors",
40
+ "model.layers.10.self_attn.v_proj.weight": "model-00004-of-00012.safetensors",
41
+ "model.layers.11.input_layernorm.weight": "model-00004-of-00012.safetensors",
42
+ "model.layers.11.mlp.down_proj.weight": "model-00004-of-00012.safetensors",
43
+ "model.layers.11.mlp.gate_proj.weight": "model-00004-of-00012.safetensors",
44
+ "model.layers.11.mlp.up_proj.weight": "model-00004-of-00012.safetensors",
45
+ "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00012.safetensors",
46
+ "model.layers.11.post_norm1.weight": "model-00004-of-00012.safetensors",
47
+ "model.layers.11.post_norm2.weight": "model-00004-of-00012.safetensors",
48
+ "model.layers.11.self_attn.k_proj.weight": "model-00004-of-00012.safetensors",
49
+ "model.layers.11.self_attn.o_proj.weight": "model-00004-of-00012.safetensors",
50
+ "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00012.safetensors",
51
+ "model.layers.11.self_attn.v_proj.weight": "model-00004-of-00012.safetensors",
52
+ "model.layers.12.input_layernorm.weight": "model-00005-of-00012.safetensors",
53
+ "model.layers.12.mlp.down_proj.weight": "model-00005-of-00012.safetensors",
54
+ "model.layers.12.mlp.gate_proj.weight": "model-00005-of-00012.safetensors",
55
+ "model.layers.12.mlp.up_proj.weight": "model-00005-of-00012.safetensors",
56
+ "model.layers.12.post_attention_layernorm.weight": "model-00005-of-00012.safetensors",
57
+ "model.layers.12.post_norm1.weight": "model-00005-of-00012.safetensors",
58
+ "model.layers.12.post_norm2.weight": "model-00005-of-00012.safetensors",
59
+ "model.layers.12.self_attn.k_proj.weight": "model-00005-of-00012.safetensors",
60
+ "model.layers.12.self_attn.o_proj.weight": "model-00005-of-00012.safetensors",
61
+ "model.layers.12.self_attn.q_proj.weight": "model-00005-of-00012.safetensors",
62
+ "model.layers.12.self_attn.v_proj.weight": "model-00005-of-00012.safetensors",
63
+ "model.layers.13.input_layernorm.weight": "model-00005-of-00012.safetensors",
64
+ "model.layers.13.mlp.down_proj.weight": "model-00005-of-00012.safetensors",
65
+ "model.layers.13.mlp.gate_proj.weight": "model-00005-of-00012.safetensors",
66
+ "model.layers.13.mlp.up_proj.weight": "model-00005-of-00012.safetensors",
67
+ "model.layers.13.post_attention_layernorm.weight": "model-00005-of-00012.safetensors",
68
+ "model.layers.13.post_norm1.weight": "model-00005-of-00012.safetensors",
69
+ "model.layers.13.post_norm2.weight": "model-00005-of-00012.safetensors",
70
+ "model.layers.13.self_attn.k_proj.weight": "model-00005-of-00012.safetensors",
71
+ "model.layers.13.self_attn.o_proj.weight": "model-00005-of-00012.safetensors",
72
+ "model.layers.13.self_attn.q_proj.weight": "model-00005-of-00012.safetensors",
73
+ "model.layers.13.self_attn.v_proj.weight": "model-00005-of-00012.safetensors",
74
+ "model.layers.14.input_layernorm.weight": "model-00005-of-00012.safetensors",
75
+ "model.layers.14.mlp.down_proj.weight": "model-00005-of-00012.safetensors",
76
+ "model.layers.14.mlp.gate_proj.weight": "model-00005-of-00012.safetensors",
77
+ "model.layers.14.mlp.up_proj.weight": "model-00005-of-00012.safetensors",
78
+ "model.layers.14.post_attention_layernorm.weight": "model-00005-of-00012.safetensors",
79
+ "model.layers.14.post_norm1.weight": "model-00005-of-00012.safetensors",
80
+ "model.layers.14.post_norm2.weight": "model-00005-of-00012.safetensors",
81
+ "model.layers.14.self_attn.k_proj.weight": "model-00005-of-00012.safetensors",
82
+ "model.layers.14.self_attn.o_proj.weight": "model-00005-of-00012.safetensors",
83
+ "model.layers.14.self_attn.q_proj.weight": "model-00005-of-00012.safetensors",
84
+ "model.layers.14.self_attn.v_proj.weight": "model-00005-of-00012.safetensors",
85
+ "model.layers.15.input_layernorm.weight": "model-00006-of-00012.safetensors",
86
+ "model.layers.15.mlp.down_proj.weight": "model-00006-of-00012.safetensors",
87
+ "model.layers.15.mlp.gate_proj.weight": "model-00005-of-00012.safetensors",
88
+ "model.layers.15.mlp.up_proj.weight": "model-00006-of-00012.safetensors",
89
+ "model.layers.15.post_attention_layernorm.weight": "model-00006-of-00012.safetensors",
90
+ "model.layers.15.post_norm1.weight": "model-00006-of-00012.safetensors",
91
+ "model.layers.15.post_norm2.weight": "model-00006-of-00012.safetensors",
92
+ "model.layers.15.self_attn.k_proj.weight": "model-00005-of-00012.safetensors",
93
+ "model.layers.15.self_attn.o_proj.weight": "model-00005-of-00012.safetensors",
94
+ "model.layers.15.self_attn.q_proj.weight": "model-00005-of-00012.safetensors",
95
+ "model.layers.15.self_attn.v_proj.weight": "model-00005-of-00012.safetensors",
96
+ "model.layers.16.input_layernorm.weight": "model-00006-of-00012.safetensors",
97
+ "model.layers.16.mlp.down_proj.weight": "model-00006-of-00012.safetensors",
98
+ "model.layers.16.mlp.gate_proj.weight": "model-00006-of-00012.safetensors",
99
+ "model.layers.16.mlp.up_proj.weight": "model-00006-of-00012.safetensors",
100
+ "model.layers.16.post_attention_layernorm.weight": "model-00006-of-00012.safetensors",
101
+ "model.layers.16.post_norm1.weight": "model-00006-of-00012.safetensors",
102
+ "model.layers.16.post_norm2.weight": "model-00006-of-00012.safetensors",
103
+ "model.layers.16.self_attn.k_proj.weight": "model-00006-of-00012.safetensors",
104
+ "model.layers.16.self_attn.o_proj.weight": "model-00006-of-00012.safetensors",
105
+ "model.layers.16.self_attn.q_proj.weight": "model-00006-of-00012.safetensors",
106
+ "model.layers.16.self_attn.v_proj.weight": "model-00006-of-00012.safetensors",
107
+ "model.layers.17.input_layernorm.weight": "model-00006-of-00012.safetensors",
108
+ "model.layers.17.mlp.down_proj.weight": "model-00006-of-00012.safetensors",
109
+ "model.layers.17.mlp.gate_proj.weight": "model-00006-of-00012.safetensors",
110
+ "model.layers.17.mlp.up_proj.weight": "model-00006-of-00012.safetensors",
111
+ "model.layers.17.post_attention_layernorm.weight": "model-00006-of-00012.safetensors",
112
+ "model.layers.17.post_norm1.weight": "model-00006-of-00012.safetensors",
113
+ "model.layers.17.post_norm2.weight": "model-00006-of-00012.safetensors",
114
+ "model.layers.17.self_attn.k_proj.weight": "model-00006-of-00012.safetensors",
115
+ "model.layers.17.self_attn.o_proj.weight": "model-00006-of-00012.safetensors",
116
+ "model.layers.17.self_attn.q_proj.weight": "model-00006-of-00012.safetensors",
117
+ "model.layers.17.self_attn.v_proj.weight": "model-00006-of-00012.safetensors",
118
+ "model.layers.18.input_layernorm.weight": "model-00006-of-00012.safetensors",
119
+ "model.layers.18.mlp.down_proj.weight": "model-00006-of-00012.safetensors",
120
+ "model.layers.18.mlp.gate_proj.weight": "model-00006-of-00012.safetensors",
121
+ "model.layers.18.mlp.up_proj.weight": "model-00006-of-00012.safetensors",
122
+ "model.layers.18.post_attention_layernorm.weight": "model-00006-of-00012.safetensors",
123
+ "model.layers.18.post_norm1.weight": "model-00006-of-00012.safetensors",
124
+ "model.layers.18.post_norm2.weight": "model-00006-of-00012.safetensors",
125
+ "model.layers.18.self_attn.k_proj.weight": "model-00006-of-00012.safetensors",
126
+ "model.layers.18.self_attn.o_proj.weight": "model-00006-of-00012.safetensors",
127
+ "model.layers.18.self_attn.q_proj.weight": "model-00006-of-00012.safetensors",
128
+ "model.layers.18.self_attn.v_proj.weight": "model-00006-of-00012.safetensors",
129
+ "model.layers.19.input_layernorm.weight": "model-00007-of-00012.safetensors",
130
+ "model.layers.19.mlp.down_proj.weight": "model-00007-of-00012.safetensors",
131
+ "model.layers.19.mlp.gate_proj.weight": "model-00007-of-00012.safetensors",
132
+ "model.layers.19.mlp.up_proj.weight": "model-00007-of-00012.safetensors",
133
+ "model.layers.19.post_attention_layernorm.weight": "model-00007-of-00012.safetensors",
134
+ "model.layers.19.post_norm1.weight": "model-00007-of-00012.safetensors",
135
+ "model.layers.19.post_norm2.weight": "model-00007-of-00012.safetensors",
136
+ "model.layers.19.self_attn.k_proj.weight": "model-00007-of-00012.safetensors",
137
+ "model.layers.19.self_attn.o_proj.weight": "model-00007-of-00012.safetensors",
138
+ "model.layers.19.self_attn.q_proj.weight": "model-00007-of-00012.safetensors",
139
+ "model.layers.19.self_attn.v_proj.weight": "model-00007-of-00012.safetensors",
140
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00012.safetensors",
141
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00012.safetensors",
142
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00012.safetensors",
143
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00012.safetensors",
144
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00012.safetensors",
145
+ "model.layers.2.post_norm1.weight": "model-00002-of-00012.safetensors",
146
+ "model.layers.2.post_norm2.weight": "model-00002-of-00012.safetensors",
147
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00012.safetensors",
148
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00012.safetensors",
149
+ "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00012.safetensors",
150
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00012.safetensors",
151
+ "model.layers.20.input_layernorm.weight": "model-00007-of-00012.safetensors",
152
+ "model.layers.20.mlp.down_proj.weight": "model-00007-of-00012.safetensors",
153
+ "model.layers.20.mlp.gate_proj.weight": "model-00007-of-00012.safetensors",
154
+ "model.layers.20.mlp.up_proj.weight": "model-00007-of-00012.safetensors",
155
+ "model.layers.20.post_attention_layernorm.weight": "model-00007-of-00012.safetensors",
156
+ "model.layers.20.post_norm1.weight": "model-00007-of-00012.safetensors",
157
+ "model.layers.20.post_norm2.weight": "model-00007-of-00012.safetensors",
158
+ "model.layers.20.self_attn.k_proj.weight": "model-00007-of-00012.safetensors",
159
+ "model.layers.20.self_attn.o_proj.weight": "model-00007-of-00012.safetensors",
160
+ "model.layers.20.self_attn.q_proj.weight": "model-00007-of-00012.safetensors",
161
+ "model.layers.20.self_attn.v_proj.weight": "model-00007-of-00012.safetensors",
162
+ "model.layers.21.input_layernorm.weight": "model-00007-of-00012.safetensors",
163
+ "model.layers.21.mlp.down_proj.weight": "model-00007-of-00012.safetensors",
164
+ "model.layers.21.mlp.gate_proj.weight": "model-00007-of-00012.safetensors",
165
+ "model.layers.21.mlp.up_proj.weight": "model-00007-of-00012.safetensors",
166
+ "model.layers.21.post_attention_layernorm.weight": "model-00007-of-00012.safetensors",
167
+ "model.layers.21.post_norm1.weight": "model-00007-of-00012.safetensors",
168
+ "model.layers.21.post_norm2.weight": "model-00007-of-00012.safetensors",
169
+ "model.layers.21.self_attn.k_proj.weight": "model-00007-of-00012.safetensors",
170
+ "model.layers.21.self_attn.o_proj.weight": "model-00007-of-00012.safetensors",
171
+ "model.layers.21.self_attn.q_proj.weight": "model-00007-of-00012.safetensors",
172
+ "model.layers.21.self_attn.v_proj.weight": "model-00007-of-00012.safetensors",
173
+ "model.layers.22.input_layernorm.weight": "model-00008-of-00012.safetensors",
174
+ "model.layers.22.mlp.down_proj.weight": "model-00008-of-00012.safetensors",
175
+ "model.layers.22.mlp.gate_proj.weight": "model-00007-of-00012.safetensors",
176
+ "model.layers.22.mlp.up_proj.weight": "model-00008-of-00012.safetensors",
177
+ "model.layers.22.post_attention_layernorm.weight": "model-00008-of-00012.safetensors",
178
+ "model.layers.22.post_norm1.weight": "model-00008-of-00012.safetensors",
179
+ "model.layers.22.post_norm2.weight": "model-00008-of-00012.safetensors",
180
+ "model.layers.22.self_attn.k_proj.weight": "model-00007-of-00012.safetensors",
181
+ "model.layers.22.self_attn.o_proj.weight": "model-00007-of-00012.safetensors",
182
+ "model.layers.22.self_attn.q_proj.weight": "model-00007-of-00012.safetensors",
183
+ "model.layers.22.self_attn.v_proj.weight": "model-00007-of-00012.safetensors",
184
+ "model.layers.23.input_layernorm.weight": "model-00008-of-00012.safetensors",
185
+ "model.layers.23.mlp.down_proj.weight": "model-00008-of-00012.safetensors",
186
+ "model.layers.23.mlp.gate_proj.weight": "model-00008-of-00012.safetensors",
187
+ "model.layers.23.mlp.up_proj.weight": "model-00008-of-00012.safetensors",
188
+ "model.layers.23.post_attention_layernorm.weight": "model-00008-of-00012.safetensors",
189
+ "model.layers.23.post_norm1.weight": "model-00008-of-00012.safetensors",
190
+ "model.layers.23.post_norm2.weight": "model-00008-of-00012.safetensors",
191
+ "model.layers.23.self_attn.k_proj.weight": "model-00008-of-00012.safetensors",
192
+ "model.layers.23.self_attn.o_proj.weight": "model-00008-of-00012.safetensors",
193
+ "model.layers.23.self_attn.q_proj.weight": "model-00008-of-00012.safetensors",
194
+ "model.layers.23.self_attn.v_proj.weight": "model-00008-of-00012.safetensors",
195
+ "model.layers.24.input_layernorm.weight": "model-00008-of-00012.safetensors",
196
+ "model.layers.24.mlp.down_proj.weight": "model-00008-of-00012.safetensors",
197
+ "model.layers.24.mlp.gate_proj.weight": "model-00008-of-00012.safetensors",
198
+ "model.layers.24.mlp.up_proj.weight": "model-00008-of-00012.safetensors",
199
+ "model.layers.24.post_attention_layernorm.weight": "model-00008-of-00012.safetensors",
200
+ "model.layers.24.post_norm1.weight": "model-00008-of-00012.safetensors",
201
+ "model.layers.24.post_norm2.weight": "model-00008-of-00012.safetensors",
202
+ "model.layers.24.self_attn.k_proj.weight": "model-00008-of-00012.safetensors",
203
+ "model.layers.24.self_attn.o_proj.weight": "model-00008-of-00012.safetensors",
204
+ "model.layers.24.self_attn.q_proj.weight": "model-00008-of-00012.safetensors",
205
+ "model.layers.24.self_attn.v_proj.weight": "model-00008-of-00012.safetensors",
206
+ "model.layers.25.input_layernorm.weight": "model-00008-of-00012.safetensors",
207
+ "model.layers.25.mlp.down_proj.weight": "model-00008-of-00012.safetensors",
208
+ "model.layers.25.mlp.gate_proj.weight": "model-00008-of-00012.safetensors",
209
+ "model.layers.25.mlp.up_proj.weight": "model-00008-of-00012.safetensors",
210
+ "model.layers.25.post_attention_layernorm.weight": "model-00008-of-00012.safetensors",
211
+ "model.layers.25.post_norm1.weight": "model-00008-of-00012.safetensors",
212
+ "model.layers.25.post_norm2.weight": "model-00008-of-00012.safetensors",
213
+ "model.layers.25.self_attn.k_proj.weight": "model-00008-of-00012.safetensors",
214
+ "model.layers.25.self_attn.o_proj.weight": "model-00008-of-00012.safetensors",
215
+ "model.layers.25.self_attn.q_proj.weight": "model-00008-of-00012.safetensors",
216
+ "model.layers.25.self_attn.v_proj.weight": "model-00008-of-00012.safetensors",
217
+ "model.layers.26.input_layernorm.weight": "model-00009-of-00012.safetensors",
218
+ "model.layers.26.mlp.down_proj.weight": "model-00009-of-00012.safetensors",
219
+ "model.layers.26.mlp.gate_proj.weight": "model-00009-of-00012.safetensors",
220
+ "model.layers.26.mlp.up_proj.weight": "model-00009-of-00012.safetensors",
221
+ "model.layers.26.post_attention_layernorm.weight": "model-00009-of-00012.safetensors",
222
+ "model.layers.26.post_norm1.weight": "model-00009-of-00012.safetensors",
223
+ "model.layers.26.post_norm2.weight": "model-00009-of-00012.safetensors",
224
+ "model.layers.26.self_attn.k_proj.weight": "model-00009-of-00012.safetensors",
225
+ "model.layers.26.self_attn.o_proj.weight": "model-00009-of-00012.safetensors",
226
+ "model.layers.26.self_attn.q_proj.weight": "model-00009-of-00012.safetensors",
227
+ "model.layers.26.self_attn.v_proj.weight": "model-00009-of-00012.safetensors",
228
+ "model.layers.27.input_layernorm.weight": "model-00009-of-00012.safetensors",
229
+ "model.layers.27.mlp.down_proj.weight": "model-00009-of-00012.safetensors",
230
+ "model.layers.27.mlp.gate_proj.weight": "model-00009-of-00012.safetensors",
231
+ "model.layers.27.mlp.up_proj.weight": "model-00009-of-00012.safetensors",
232
+ "model.layers.27.post_attention_layernorm.weight": "model-00009-of-00012.safetensors",
233
+ "model.layers.27.post_norm1.weight": "model-00009-of-00012.safetensors",
234
+ "model.layers.27.post_norm2.weight": "model-00009-of-00012.safetensors",
235
+ "model.layers.27.self_attn.k_proj.weight": "model-00009-of-00012.safetensors",
236
+ "model.layers.27.self_attn.o_proj.weight": "model-00009-of-00012.safetensors",
237
+ "model.layers.27.self_attn.q_proj.weight": "model-00009-of-00012.safetensors",
238
+ "model.layers.27.self_attn.v_proj.weight": "model-00009-of-00012.safetensors",
239
+ "model.layers.28.input_layernorm.weight": "model-00009-of-00012.safetensors",
240
+ "model.layers.28.mlp.down_proj.weight": "model-00009-of-00012.safetensors",
241
+ "model.layers.28.mlp.gate_proj.weight": "model-00009-of-00012.safetensors",
242
+ "model.layers.28.mlp.up_proj.weight": "model-00009-of-00012.safetensors",
243
+ "model.layers.28.post_attention_layernorm.weight": "model-00009-of-00012.safetensors",
244
+ "model.layers.28.post_norm1.weight": "model-00009-of-00012.safetensors",
245
+ "model.layers.28.post_norm2.weight": "model-00009-of-00012.safetensors",
246
+ "model.layers.28.self_attn.k_proj.weight": "model-00009-of-00012.safetensors",
247
+ "model.layers.28.self_attn.o_proj.weight": "model-00009-of-00012.safetensors",
248
+ "model.layers.28.self_attn.q_proj.weight": "model-00009-of-00012.safetensors",
249
+ "model.layers.28.self_attn.v_proj.weight": "model-00009-of-00012.safetensors",
250
+ "model.layers.29.input_layernorm.weight": "model-00010-of-00012.safetensors",
251
+ "model.layers.29.mlp.down_proj.weight": "model-00010-of-00012.safetensors",
252
+ "model.layers.29.mlp.gate_proj.weight": "model-00009-of-00012.safetensors",
253
+ "model.layers.29.mlp.up_proj.weight": "model-00010-of-00012.safetensors",
254
+ "model.layers.29.post_attention_layernorm.weight": "model-00010-of-00012.safetensors",
255
+ "model.layers.29.post_norm1.weight": "model-00010-of-00012.safetensors",
256
+ "model.layers.29.post_norm2.weight": "model-00010-of-00012.safetensors",
257
+ "model.layers.29.self_attn.k_proj.weight": "model-00009-of-00012.safetensors",
258
+ "model.layers.29.self_attn.o_proj.weight": "model-00009-of-00012.safetensors",
259
+ "model.layers.29.self_attn.q_proj.weight": "model-00009-of-00012.safetensors",
260
+ "model.layers.29.self_attn.v_proj.weight": "model-00009-of-00012.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00012.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00012.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00012.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00012.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00012.safetensors",
266
+ "model.layers.3.post_norm1.weight": "model-00002-of-00012.safetensors",
267
+ "model.layers.3.post_norm2.weight": "model-00002-of-00012.safetensors",
268
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00012.safetensors",
269
+ "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00012.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00012.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00012.safetensors",
272
+ "model.layers.30.input_layernorm.weight": "model-00010-of-00012.safetensors",
273
+ "model.layers.30.mlp.down_proj.weight": "model-00010-of-00012.safetensors",
274
+ "model.layers.30.mlp.gate_proj.weight": "model-00010-of-00012.safetensors",
275
+ "model.layers.30.mlp.up_proj.weight": "model-00010-of-00012.safetensors",
276
+ "model.layers.30.post_attention_layernorm.weight": "model-00010-of-00012.safetensors",
277
+ "model.layers.30.post_norm1.weight": "model-00010-of-00012.safetensors",
278
+ "model.layers.30.post_norm2.weight": "model-00010-of-00012.safetensors",
279
+ "model.layers.30.self_attn.k_proj.weight": "model-00010-of-00012.safetensors",
280
+ "model.layers.30.self_attn.o_proj.weight": "model-00010-of-00012.safetensors",
281
+ "model.layers.30.self_attn.q_proj.weight": "model-00010-of-00012.safetensors",
282
+ "model.layers.30.self_attn.v_proj.weight": "model-00010-of-00012.safetensors",
283
+ "model.layers.31.input_layernorm.weight": "model-00010-of-00012.safetensors",
284
+ "model.layers.31.mlp.down_proj.weight": "model-00010-of-00012.safetensors",
285
+ "model.layers.31.mlp.gate_proj.weight": "model-00010-of-00012.safetensors",
286
+ "model.layers.31.mlp.up_proj.weight": "model-00010-of-00012.safetensors",
287
+ "model.layers.31.post_attention_layernorm.weight": "model-00010-of-00012.safetensors",
288
+ "model.layers.31.post_norm1.weight": "model-00010-of-00012.safetensors",
289
+ "model.layers.31.post_norm2.weight": "model-00010-of-00012.safetensors",
290
+ "model.layers.31.self_attn.k_proj.weight": "model-00010-of-00012.safetensors",
291
+ "model.layers.31.self_attn.o_proj.weight": "model-00010-of-00012.safetensors",
292
+ "model.layers.31.self_attn.q_proj.weight": "model-00010-of-00012.safetensors",
293
+ "model.layers.31.self_attn.v_proj.weight": "model-00010-of-00012.safetensors",
294
+ "model.layers.32.input_layernorm.weight": "model-00010-of-00012.safetensors",
295
+ "model.layers.32.mlp.down_proj.weight": "model-00010-of-00012.safetensors",
296
+ "model.layers.32.mlp.gate_proj.weight": "model-00010-of-00012.safetensors",
297
+ "model.layers.32.mlp.up_proj.weight": "model-00010-of-00012.safetensors",
298
+ "model.layers.32.post_attention_layernorm.weight": "model-00010-of-00012.safetensors",
299
+ "model.layers.32.post_norm1.weight": "model-00010-of-00012.safetensors",
300
+ "model.layers.32.post_norm2.weight": "model-00010-of-00012.safetensors",
301
+ "model.layers.32.self_attn.k_proj.weight": "model-00010-of-00012.safetensors",
302
+ "model.layers.32.self_attn.o_proj.weight": "model-00010-of-00012.safetensors",
303
+ "model.layers.32.self_attn.q_proj.weight": "model-00010-of-00012.safetensors",
304
+ "model.layers.32.self_attn.v_proj.weight": "model-00010-of-00012.safetensors",
305
+ "model.layers.33.input_layernorm.weight": "model-00011-of-00012.safetensors",
306
+ "model.layers.33.mlp.down_proj.weight": "model-00011-of-00012.safetensors",
307
+ "model.layers.33.mlp.gate_proj.weight": "model-00011-of-00012.safetensors",
308
+ "model.layers.33.mlp.up_proj.weight": "model-00011-of-00012.safetensors",
309
+ "model.layers.33.post_attention_layernorm.weight": "model-00011-of-00012.safetensors",
310
+ "model.layers.33.post_norm1.weight": "model-00011-of-00012.safetensors",
311
+ "model.layers.33.post_norm2.weight": "model-00011-of-00012.safetensors",
312
+ "model.layers.33.self_attn.k_proj.weight": "model-00011-of-00012.safetensors",
313
+ "model.layers.33.self_attn.o_proj.weight": "model-00011-of-00012.safetensors",
314
+ "model.layers.33.self_attn.q_proj.weight": "model-00011-of-00012.safetensors",
315
+ "model.layers.33.self_attn.v_proj.weight": "model-00011-of-00012.safetensors",
316
+ "model.layers.34.input_layernorm.weight": "model-00011-of-00012.safetensors",
317
+ "model.layers.34.mlp.down_proj.weight": "model-00011-of-00012.safetensors",
318
+ "model.layers.34.mlp.gate_proj.weight": "model-00011-of-00012.safetensors",
319
+ "model.layers.34.mlp.up_proj.weight": "model-00011-of-00012.safetensors",
320
+ "model.layers.34.post_attention_layernorm.weight": "model-00011-of-00012.safetensors",
321
+ "model.layers.34.post_norm1.weight": "model-00011-of-00012.safetensors",
322
+ "model.layers.34.post_norm2.weight": "model-00011-of-00012.safetensors",
323
+ "model.layers.34.self_attn.k_proj.weight": "model-00011-of-00012.safetensors",
324
+ "model.layers.34.self_attn.o_proj.weight": "model-00011-of-00012.safetensors",
325
+ "model.layers.34.self_attn.q_proj.weight": "model-00011-of-00012.safetensors",
326
+ "model.layers.34.self_attn.v_proj.weight": "model-00011-of-00012.safetensors",
327
+ "model.layers.35.input_layernorm.weight": "model-00011-of-00012.safetensors",
328
+ "model.layers.35.mlp.down_proj.weight": "model-00011-of-00012.safetensors",
329
+ "model.layers.35.mlp.gate_proj.weight": "model-00011-of-00012.safetensors",
330
+ "model.layers.35.mlp.up_proj.weight": "model-00011-of-00012.safetensors",
331
+ "model.layers.35.post_attention_layernorm.weight": "model-00011-of-00012.safetensors",
332
+ "model.layers.35.post_norm1.weight": "model-00011-of-00012.safetensors",
333
+ "model.layers.35.post_norm2.weight": "model-00011-of-00012.safetensors",
334
+ "model.layers.35.self_attn.k_proj.weight": "model-00011-of-00012.safetensors",
335
+ "model.layers.35.self_attn.o_proj.weight": "model-00011-of-00012.safetensors",
336
+ "model.layers.35.self_attn.q_proj.weight": "model-00011-of-00012.safetensors",
337
+ "model.layers.35.self_attn.v_proj.weight": "model-00011-of-00012.safetensors",
338
+ "model.layers.36.input_layernorm.weight": "model-00012-of-00012.safetensors",
339
+ "model.layers.36.mlp.down_proj.weight": "model-00012-of-00012.safetensors",
340
+ "model.layers.36.mlp.gate_proj.weight": "model-00011-of-00012.safetensors",
341
+ "model.layers.36.mlp.up_proj.weight": "model-00012-of-00012.safetensors",
342
+ "model.layers.36.post_attention_layernorm.weight": "model-00012-of-00012.safetensors",
343
+ "model.layers.36.post_norm1.weight": "model-00012-of-00012.safetensors",
344
+ "model.layers.36.post_norm2.weight": "model-00012-of-00012.safetensors",
345
+ "model.layers.36.self_attn.k_proj.weight": "model-00011-of-00012.safetensors",
346
+ "model.layers.36.self_attn.o_proj.weight": "model-00011-of-00012.safetensors",
347
+ "model.layers.36.self_attn.q_proj.weight": "model-00011-of-00012.safetensors",
348
+ "model.layers.36.self_attn.v_proj.weight": "model-00011-of-00012.safetensors",
349
+ "model.layers.37.input_layernorm.weight": "model-00012-of-00012.safetensors",
350
+ "model.layers.37.mlp.down_proj.weight": "model-00012-of-00012.safetensors",
351
+ "model.layers.37.mlp.gate_proj.weight": "model-00012-of-00012.safetensors",
352
+ "model.layers.37.mlp.up_proj.weight": "model-00012-of-00012.safetensors",
353
+ "model.layers.37.post_attention_layernorm.weight": "model-00012-of-00012.safetensors",
354
+ "model.layers.37.post_norm1.weight": "model-00012-of-00012.safetensors",
355
+ "model.layers.37.post_norm2.weight": "model-00012-of-00012.safetensors",
356
+ "model.layers.37.self_attn.k_proj.weight": "model-00012-of-00012.safetensors",
357
+ "model.layers.37.self_attn.o_proj.weight": "model-00012-of-00012.safetensors",
358
+ "model.layers.37.self_attn.q_proj.weight": "model-00012-of-00012.safetensors",
359
+ "model.layers.37.self_attn.v_proj.weight": "model-00012-of-00012.safetensors",
360
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00012.safetensors",
361
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00012.safetensors",
362
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00012.safetensors",
363
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00012.safetensors",
364
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00012.safetensors",
365
+ "model.layers.4.post_norm1.weight": "model-00002-of-00012.safetensors",
366
+ "model.layers.4.post_norm2.weight": "model-00002-of-00012.safetensors",
367
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00012.safetensors",
368
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00012.safetensors",
369
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00012.safetensors",
370
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00012.safetensors",
371
+ "model.layers.5.input_layernorm.weight": "model-00003-of-00012.safetensors",
372
+ "model.layers.5.mlp.down_proj.weight": "model-00003-of-00012.safetensors",
373
+ "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00012.safetensors",
374
+ "model.layers.5.mlp.up_proj.weight": "model-00003-of-00012.safetensors",
375
+ "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00012.safetensors",
376
+ "model.layers.5.post_norm1.weight": "model-00003-of-00012.safetensors",
377
+ "model.layers.5.post_norm2.weight": "model-00003-of-00012.safetensors",
378
+ "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00012.safetensors",
379
+ "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00012.safetensors",
380
+ "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00012.safetensors",
381
+ "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00012.safetensors",
382
+ "model.layers.6.input_layernorm.weight": "model-00003-of-00012.safetensors",
383
+ "model.layers.6.mlp.down_proj.weight": "model-00003-of-00012.safetensors",
384
+ "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00012.safetensors",
385
+ "model.layers.6.mlp.up_proj.weight": "model-00003-of-00012.safetensors",
386
+ "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00012.safetensors",
387
+ "model.layers.6.post_norm1.weight": "model-00003-of-00012.safetensors",
388
+ "model.layers.6.post_norm2.weight": "model-00003-of-00012.safetensors",
389
+ "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00012.safetensors",
390
+ "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00012.safetensors",
391
+ "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00012.safetensors",
392
+ "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00012.safetensors",
393
+ "model.layers.7.input_layernorm.weight": "model-00003-of-00012.safetensors",
394
+ "model.layers.7.mlp.down_proj.weight": "model-00003-of-00012.safetensors",
395
+ "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00012.safetensors",
396
+ "model.layers.7.mlp.up_proj.weight": "model-00003-of-00012.safetensors",
397
+ "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00012.safetensors",
398
+ "model.layers.7.post_norm1.weight": "model-00003-of-00012.safetensors",
399
+ "model.layers.7.post_norm2.weight": "model-00003-of-00012.safetensors",
400
+ "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00012.safetensors",
401
+ "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00012.safetensors",
402
+ "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00012.safetensors",
403
+ "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00012.safetensors",
404
+ "model.layers.8.input_layernorm.weight": "model-00004-of-00012.safetensors",
405
+ "model.layers.8.mlp.down_proj.weight": "model-00004-of-00012.safetensors",
406
+ "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00012.safetensors",
407
+ "model.layers.8.mlp.up_proj.weight": "model-00004-of-00012.safetensors",
408
+ "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00012.safetensors",
409
+ "model.layers.8.post_norm1.weight": "model-00004-of-00012.safetensors",
410
+ "model.layers.8.post_norm2.weight": "model-00004-of-00012.safetensors",
411
+ "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00012.safetensors",
412
+ "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00012.safetensors",
413
+ "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00012.safetensors",
414
+ "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00012.safetensors",
415
+ "model.layers.9.input_layernorm.weight": "model-00004-of-00012.safetensors",
416
+ "model.layers.9.mlp.down_proj.weight": "model-00004-of-00012.safetensors",
417
+ "model.layers.9.mlp.gate_proj.weight": "model-00004-of-00012.safetensors",
418
+ "model.layers.9.mlp.up_proj.weight": "model-00004-of-00012.safetensors",
419
+ "model.layers.9.post_attention_layernorm.weight": "model-00004-of-00012.safetensors",
420
+ "model.layers.9.post_norm1.weight": "model-00004-of-00012.safetensors",
421
+ "model.layers.9.post_norm2.weight": "model-00004-of-00012.safetensors",
422
+ "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00012.safetensors",
423
+ "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00012.safetensors",
424
+ "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00012.safetensors",
425
+ "model.layers.9.self_attn.v_proj.weight": "model-00004-of-00012.safetensors",
426
+ "model.norm.weight": "model-00012-of-00012.safetensors"
427
+ }
428
+ }
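
The index maps every parameter name to the shard that stores it, so a single tensor can be loaded without reading all twelve shards. This matters here because the float32 checkpoint totals roughly 59 GB. A minimal sketch, assuming a local checkout of this repository with the LFS weights pulled:

```python
import json
from pathlib import Path

from safetensors import safe_open

MODEL_DIR = Path(".")  # assumed: a local checkout of this repository with the LFS weights pulled

# 1) Find which shard stores a given parameter.
index = json.loads((MODEL_DIR / "model.safetensors.index.json").read_text())
name = "model.layers.0.self_attn.q_proj.weight"
shard = index["weight_map"][name]  # "model-00001-of-00012.safetensors" for this tensor

# 2) Load only that tensor from its shard.
with safe_open(str(MODEL_DIR / shard), framework="pt", device="cpu") as f:
    tensor = f.get_tensor(name)

print(shard, tuple(tensor.shape))
```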
modeling_hyperclovax.py ADDED
@@ -0,0 +1,979 @@
1
+ # coding=utf-8
2
+ # This file was created for the HyperCLOVA X SEED 14B Think architecture.
3
+ # partially copied and modified from https://github.com/huggingface/transformers
4
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
5
+ #
6
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
7
+ # and OPT implementations in this library. It has been modified from its
8
+ # original forms to accommodate minor architectural differences compared
9
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+ from typing import Callable, Optional, Union
23
+
24
+ import torch
25
+ import torch.utils.checkpoint
26
+ from torch import nn
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache, DynamicCache
30
+ from transformers.generation import GenerationMixin
31
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
32
+ from transformers.integrations import use_kernel_forward_from_hub
33
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
34
+ from transformers.modeling_layers import GradientCheckpointingLayer
35
+ from transformers.modeling_outputs import (
36
+ BaseModelOutputWithPast,
37
+ CausalLMOutputWithPast,
38
+ QuestionAnsweringModelOutput,
39
+ SequenceClassifierOutputWithPast,
40
+ TokenClassifierOutput,
41
+ )
42
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
43
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
44
+ from transformers.processing_utils import Unpack
45
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
46
+ from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
47
+ from .configuration_hyperclovax import HyperCLOVAXConfig
48
+ if is_torch_flex_attn_available():
49
+ from torch.nn.attention.flex_attention import BlockMask
50
+
51
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
52
+
53
+ logger = logging.get_logger(__name__)
54
+
55
+
56
+ @use_kernel_forward_from_hub("RMSNorm")
57
+ class HyperCLOVAXRMSNorm(nn.Module):
58
+ def __init__(self, hidden_size, eps=1e-6):
59
+ """
60
+ HyperCLOVAXRMSNorm is equivalent to T5LayerNorm
61
+ """
62
+ super().__init__()
63
+ self.weight = nn.Parameter(torch.ones(hidden_size))
64
+ self.variance_epsilon = eps
65
+
66
+ def forward(self, hidden_states):
67
+ input_dtype = hidden_states.dtype
68
+ hidden_states = hidden_states.to(torch.float32)
69
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
70
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
71
+ return self.weight * hidden_states.to(input_dtype)
72
+
73
+ def extra_repr(self):
74
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
75
+
76
+ ALL_LAYERNORM_LAYERS.append(HyperCLOVAXRMSNorm)
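A minimal sketch (not part of the repository code) checking that `HyperCLOVAXRMSNorm` matches the explicit root-mean-square formula implemented in its `forward`; the shapes below are arbitrary illustration values.
```python
# Hypothetical usage sketch for HyperCLOVAXRMSNorm defined above (illustration only).
import torch

norm = HyperCLOVAXRMSNorm(hidden_size=8, eps=1e-6)
x = torch.randn(2, 4, 8)

# Manual RMS normalization: x / sqrt(mean(x^2) + eps), then the learned per-channel scale.
rms = torch.sqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
manual = norm.weight * (x / rms)

assert torch.allclose(norm(x), manual, atol=1e-5)
```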
77
+ class HyperCLOVAXRotaryEmbedding(nn.Module):
78
+ def __init__(self, config: HyperCLOVAXConfig, device=None):
79
+ super().__init__()
80
+ # BC: "rope_type" was originally "type"
81
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
82
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
83
+ else:
84
+ self.rope_type = "default"
85
+ self.max_seq_len_cached = config.max_position_embeddings
86
+ self.original_max_seq_len = config.max_position_embeddings
87
+
88
+ self.config = config
89
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
90
+
91
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
92
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
93
+ self.original_inv_freq = self.inv_freq
94
+
95
+ @torch.no_grad()
96
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
97
+ def forward(self, x, position_ids):
98
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
99
+ position_ids_expanded = position_ids[:, None, :].float()
100
+
101
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
102
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
103
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
104
+ emb = torch.cat((freqs, freqs), dim=-1)
105
+ cos = emb.cos() * self.attention_scaling
106
+ sin = emb.sin() * self.attention_scaling
107
+
108
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
109
+
110
+
111
+ def rotate_half(x):
112
+ """Rotates half the hidden dims of the input."""
113
+ x1 = x[..., : x.shape[-1] // 2]
114
+ x2 = x[..., x.shape[-1] // 2 :]
115
+ return torch.cat((-x2, x1), dim=-1)
116
+
117
+
118
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
119
+ """Applies Rotary Position Embedding to the query and key tensors.
120
+
121
+ Args:
122
+ q (`torch.Tensor`): The query tensor.
123
+ k (`torch.Tensor`): The key tensor.
124
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
125
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
126
+ position_ids (`torch.Tensor`, *optional*):
127
+ Deprecated and unused.
128
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
129
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
130
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
131
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
132
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
133
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
134
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
135
+ Returns:
136
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
137
+ """
138
+ cos = cos.unsqueeze(unsqueeze_dim)
139
+ sin = sin.unsqueeze(unsqueeze_dim)
140
+ q_embed = (q * cos) + (rotate_half(q) * sin)
141
+ k_embed = (k * cos) + (rotate_half(k) * sin)
142
+ return q_embed, k_embed
143
+
144
+
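A shape-level sketch of `apply_rotary_pos_emb` as described in its docstring: `cos`/`sin` of shape `[batch, seq_len, head_dim]` are broadcast over the heads dimension with `unsqueeze_dim=1`. The frequency construction below is an illustrative stand-in, not the exact `HyperCLOVAXRotaryEmbedding` output.
```python
# Hypothetical broadcast check for apply_rotary_pos_emb (illustration only).
import torch

batch, heads, seq_len, head_dim = 1, 2, 5, 8
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)

# Build cos/sin with shape [batch, seq_len, head_dim], as the rotary module returns.
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)   # [seq_len, head_dim // 2]
emb = torch.cat((freqs, freqs), dim=-1)[None]                  # [1, seq_len, head_dim]
cos, sin = emb.cos(), emb.sin()

# unsqueeze_dim=1 makes cos/sin broadcastable to [batch, heads, seq_len, head_dim].
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1)
assert q_rot.shape == q.shape and k_rot.shape == k.shape
```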
145
+ class HyperCLOVAXMLP(nn.Module):
146
+ def __init__(self, config):
147
+ super().__init__()
148
+ self.config = config
149
+ self.hidden_size = config.hidden_size
150
+ self.intermediate_size = config.intermediate_size
151
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
152
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
153
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
154
+ self.act_fn = ACT2FN[config.hidden_act]
155
+
156
+ def forward(self, x):
157
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
158
+ return down_proj
159
+
160
+
161
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
162
+ """
163
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
164
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
165
+ """
166
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
167
+ if n_rep == 1:
168
+ return hidden_states
169
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
170
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
171
+
172
+
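The docstring above states that `repeat_kv` is equivalent to `torch.repeat_interleave` along the head dimension; a small self-check of that claim, with arbitrary example shapes:
```python
# Equivalence check for repeat_kv (illustration only).
import torch

kv = torch.randn(2, 4, 10, 16)   # (batch, num_key_value_heads, seq_len, head_dim)
n_rep = 3                        # num_attention_heads // num_key_value_heads

expanded = repeat_kv(kv, n_rep)  # (batch, num_key_value_heads * n_rep, seq_len, head_dim)
reference = torch.repeat_interleave(kv, repeats=n_rep, dim=1)
assert torch.equal(expanded, reference)
```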
173
+ def eager_attention_forward(
174
+ module: nn.Module,
175
+ query: torch.Tensor,
176
+ key: torch.Tensor,
177
+ value: torch.Tensor,
178
+ attention_mask: Optional[torch.Tensor],
179
+ scaling: float,
180
+ dropout: float = 0.0,
181
+ **kwargs,
182
+ ):
183
+ key_states = repeat_kv(key, module.num_key_value_groups)
184
+ value_states = repeat_kv(value, module.num_key_value_groups)
185
+
186
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
187
+ if attention_mask is not None:
188
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
189
+ attn_weights = attn_weights + causal_mask
190
+
191
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
192
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
193
+ attn_output = torch.matmul(attn_weights, value_states)
194
+ attn_output = attn_output.transpose(1, 2).contiguous()
195
+
196
+ return attn_output, attn_weights
197
+
198
+
199
+ class HyperCLOVAXAttention(nn.Module):
200
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
201
+
202
+ def __init__(self, config: HyperCLOVAXConfig, layer_idx: int):
203
+ super().__init__()
204
+ self.config = config
205
+ self.layer_idx = layer_idx
206
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
207
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
208
+ self.scaling = getattr(config, "attention_multiplier", self.head_dim**-0.5) # MuP
209
+ self.attention_dropout = config.attention_dropout
210
+ self.is_causal = True
211
+
212
+ self.q_proj = nn.Linear(
213
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
214
+ )
215
+ self.k_proj = nn.Linear(
216
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
217
+ )
218
+ self.v_proj = nn.Linear(
219
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
220
+ )
221
+ self.o_proj = nn.Linear(
222
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
223
+ )
224
+
225
+ def forward(
226
+ self,
227
+ hidden_states: torch.Tensor,
228
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
229
+ attention_mask: Optional[torch.Tensor],
230
+ past_key_value: Optional[Cache] = None,
231
+ cache_position: Optional[torch.LongTensor] = None,
232
+ **kwargs: Unpack[FlashAttentionKwargs],
233
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
234
+ input_shape = hidden_states.shape[:-1]
235
+ hidden_shape = (*input_shape, -1, self.head_dim)
236
+
237
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
238
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
239
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
240
+
241
+ cos, sin = position_embeddings
242
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
243
+
244
+ if past_key_value is not None:
245
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
246
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
247
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
248
+
249
+ attention_interface: Callable = eager_attention_forward
250
+
251
+ if self.config._attn_implementation != "eager":
252
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
253
+ logger.warning_once(
254
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
255
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
256
+ )
257
+ else:
258
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
259
+
260
+ attn_output, attn_weights = attention_interface(
261
+ self,
262
+ query_states,
263
+ key_states,
264
+ value_states,
265
+ attention_mask,
266
+ dropout=0.0 if not self.training else self.attention_dropout,
267
+ scaling=self.scaling,
268
+ **kwargs,
269
+ )
270
+
271
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
272
+ attn_output = self.o_proj(attn_output)
273
+ return attn_output, attn_weights
274
+
275
+
276
+ class HyperCLOVAXDecoderLayer(GradientCheckpointingLayer):
277
+ def __init__(self, config: HyperCLOVAXConfig, layer_idx: int):
278
+ super().__init__()
279
+ self.hidden_size = config.hidden_size
280
+
281
+ self.self_attn = HyperCLOVAXAttention(config=config, layer_idx=layer_idx)
282
+
283
+ self.mlp = HyperCLOVAXMLP(config)
284
+ self.input_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
285
+ self.post_attention_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
286
+ self.use_post_norm = getattr(config, "use_post_norm", False)
287
+
288
+ # Peri-LN (post-norm)
289
+ if self.use_post_norm:
290
+ self.post_norm1 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
291
+ self.post_norm2 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
292
+
293
+ self.residual_multiplier = getattr(config, "residual_multiplier", 1.0) # MuP
294
+
295
+ def forward(
296
+ self,
297
+ hidden_states: torch.Tensor,
298
+ attention_mask: Optional[torch.Tensor] = None,
299
+ position_ids: Optional[torch.LongTensor] = None,
300
+ past_key_value: Optional[Cache] = None,
301
+ output_attentions: Optional[bool] = False,
302
+ use_cache: Optional[bool] = False,
303
+ cache_position: Optional[torch.LongTensor] = None,
304
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
305
+ **kwargs: Unpack[FlashAttentionKwargs],
306
+ ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
307
+ residual = hidden_states
308
+ hidden_states = self.input_layernorm(hidden_states)
309
+
310
+ # Self Attention
311
+ hidden_states, self_attn_weights = self.self_attn(
312
+ hidden_states=hidden_states,
313
+ attention_mask=attention_mask,
314
+ position_ids=position_ids,
315
+ past_key_value=past_key_value,
316
+ output_attentions=output_attentions,
317
+ use_cache=use_cache,
318
+ cache_position=cache_position,
319
+ position_embeddings=position_embeddings,
320
+ **kwargs,
321
+ )
322
+
323
+ if self.use_post_norm: # Peri-LN
324
+ hidden_states = self.post_norm1(hidden_states)
325
+
326
+ hidden_states = residual + hidden_states * self.residual_multiplier # MuP
327
+
328
+ # Fully Connected
329
+ residual = hidden_states
330
+ hidden_states = self.post_attention_layernorm(hidden_states)
331
+ hidden_states = self.mlp(hidden_states)
332
+
333
+ if self.use_post_norm: # Peri-LN
334
+ hidden_states = self.post_norm2(hidden_states)
335
+
336
+ hidden_states = residual + hidden_states * self.residual_multiplier # MuP
337
+
338
+ outputs = (hidden_states,)
339
+ if output_attentions:
340
+ outputs += (self_attn_weights,)
341
+
342
+ return outputs
343
+
344
+
345
+ @auto_docstring
346
+ class HyperCLOVAXPreTrainedModel(PreTrainedModel):
347
+ config_class = HyperCLOVAXConfig
348
+ base_model_prefix = "model"
349
+ supports_gradient_checkpointing = True
350
+ _no_split_modules = ["HyperCLOVAXDecoderLayer"]
351
+ _skip_keys_device_placement = ["past_key_values"]
352
+ _supports_flash_attn_2 = True
353
+ _supports_sdpa = True
354
+ _supports_flex_attn = True
355
+ _supports_cache_class = True
356
+ _supports_quantized_cache = True
357
+ _supports_static_cache = True
358
+ _supports_attention_backend = True
359
+
360
+ def _init_weights(self, module):
361
+ std = self.config.initializer_range
362
+ if isinstance(module, nn.Linear):
363
+ module.weight.data.normal_(mean=0.0, std=std)
364
+ if module.bias is not None:
365
+ module.bias.data.zero_()
366
+ elif isinstance(module, nn.Embedding):
367
+ module.weight.data.normal_(mean=0.0, std=std)
368
+ if module.padding_idx is not None:
369
+ module.weight.data[module.padding_idx].zero_()
370
+ elif isinstance(module, HyperCLOVAXRMSNorm):
371
+ module.weight.data.fill_(1.0)
372
+
373
+
374
+ @auto_docstring
375
+ class HyperCLOVAXModel(HyperCLOVAXPreTrainedModel):
376
+ def __init__(self, config: HyperCLOVAXConfig):
377
+ super().__init__(config)
378
+ self.padding_idx = config.pad_token_id
379
+ self.vocab_size = config.vocab_size
380
+
381
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
382
+ self.layers = nn.ModuleList(
383
+ [HyperCLOVAXDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
384
+ )
385
+ self.norm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
386
+ self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=config)
387
+ self.gradient_checkpointing = False
388
+
389
+ # Initialize weights and apply final processing
390
+ self.post_init()
391
+
392
+ # MuP
393
+ self.embedding_multiplier = getattr(config, "embedding_multiplier", 1.0)
394
+
395
+ def get_input_embeddings(self):
396
+ return self.embed_tokens
397
+
398
+ def set_input_embeddings(self, value):
399
+ self.embed_tokens = value
400
+
401
+ @can_return_tuple
402
+ @auto_docstring
403
+ def forward(
404
+ self,
405
+ input_ids: Optional[torch.LongTensor] = None,
406
+ attention_mask: Optional[torch.Tensor] = None,
407
+ position_ids: Optional[torch.LongTensor] = None,
408
+ past_key_values: Optional[Cache] = None,
409
+ inputs_embeds: Optional[torch.FloatTensor] = None,
410
+ use_cache: Optional[bool] = None,
411
+ output_attentions: Optional[bool] = None,
412
+ output_hidden_states: Optional[bool] = None,
413
+ cache_position: Optional[torch.LongTensor] = None,
414
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
415
+ ) -> BaseModelOutputWithPast:
416
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
417
+ output_hidden_states = (
418
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
419
+ )
420
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
421
+
422
+ if (input_ids is None) ^ (inputs_embeds is not None):
423
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
424
+
425
+ if self.gradient_checkpointing and self.training and use_cache:
426
+ logger.warning_once(
427
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
428
+ )
429
+ use_cache = False
430
+
431
+ # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
432
+ if not isinstance(past_key_values, (type(None), Cache)):
433
+ raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
434
+
435
+ if inputs_embeds is None:
436
+ inputs_embeds = self.embed_tokens(input_ids)
437
+
438
+ inputs_embeds = inputs_embeds * self.embedding_multiplier # MuP
439
+
440
+ if use_cache and past_key_values is None:
441
+ past_key_values = DynamicCache()
442
+
443
+ if cache_position is None:
444
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
445
+ cache_position = torch.arange(
446
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
447
+ )
448
+
449
+ if position_ids is None:
450
+ position_ids = cache_position.unsqueeze(0)
451
+
452
+ causal_mask = self._update_causal_mask(
453
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
454
+ )
455
+
456
+ hidden_states = inputs_embeds
457
+
458
+ # create position embeddings to be shared across the decoder layers
459
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
460
+
461
+ # decoder layers
462
+ all_hidden_states = () if output_hidden_states else None
463
+ all_self_attns = () if output_attentions else None
464
+
465
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
466
+ if output_hidden_states:
467
+ all_hidden_states += (hidden_states,)
468
+
469
+ layer_outputs = decoder_layer(
470
+ hidden_states,
471
+ attention_mask=causal_mask,
472
+ position_ids=position_ids,
473
+ past_key_value=past_key_values,
474
+ output_attentions=output_attentions,
475
+ use_cache=use_cache,
476
+ cache_position=cache_position,
477
+ position_embeddings=position_embeddings,
478
+ **flash_attn_kwargs,
479
+ )
480
+
481
+ hidden_states = layer_outputs[0]
482
+
483
+ if output_attentions:
484
+ all_self_attns += (layer_outputs[1],)
485
+
486
+ hidden_states = self.norm(hidden_states)
487
+
488
+ # add hidden states from the last decoder layer
489
+ if output_hidden_states:
490
+ all_hidden_states += (hidden_states,)
491
+
492
+ return BaseModelOutputWithPast(
493
+ last_hidden_state=hidden_states,
494
+ past_key_values=past_key_values if use_cache else None,
495
+ hidden_states=all_hidden_states,
496
+ attentions=all_self_attns,
497
+ )
498
+
499
+ def _update_causal_mask(
500
+ self,
501
+ attention_mask: Union[torch.Tensor, "BlockMask"],
502
+ input_tensor: torch.Tensor,
503
+ cache_position: torch.Tensor,
504
+ past_key_values: Cache,
505
+ output_attentions: bool = False,
506
+ ):
507
+ if self.config._attn_implementation == "flash_attention_2":
508
+ if attention_mask is not None and (attention_mask == 0.0).any():
509
+ return attention_mask
510
+ return None
511
+ if self.config._attn_implementation == "flex_attention":
512
+ if isinstance(attention_mask, torch.Tensor):
513
+ attention_mask = make_flex_block_causal_mask(attention_mask)
514
+ return attention_mask
515
+
516
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
517
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
518
+ # to infer the attention mask.
519
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
520
+ using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
521
+
522
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
523
+ if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
524
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
525
+ attention_mask,
526
+ inputs_embeds=input_tensor,
527
+ past_key_values_length=past_seen_tokens,
528
+ is_training=self.training,
529
+ ):
530
+ return None
531
+
532
+ dtype = input_tensor.dtype
533
+ sequence_length = input_tensor.shape[1]
534
+ if using_compilable_cache:
535
+ target_length = past_key_values.get_max_cache_shape()
536
+ else:
537
+ target_length = (
538
+ attention_mask.shape[-1]
539
+ if isinstance(attention_mask, torch.Tensor)
540
+ else past_seen_tokens + sequence_length + 1
541
+ )
542
+
543
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
544
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
545
+ attention_mask,
546
+ sequence_length=sequence_length,
547
+ target_length=target_length,
548
+ dtype=dtype,
549
+ cache_position=cache_position,
550
+ batch_size=input_tensor.shape[0],
551
+ )
552
+
553
+ if (
554
+ self.config._attn_implementation == "sdpa"
555
+ and attention_mask is not None
556
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
557
+ and not output_attentions
558
+ ):
559
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
560
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
561
+ # Details: https://github.com/pytorch/pytorch/issues/110213
562
+ min_dtype = torch.finfo(dtype).min
563
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
564
+
565
+ return causal_mask
566
+
567
+ @staticmethod
568
+ def _prepare_4d_causal_attention_mask_with_cache_position(
569
+ attention_mask: torch.Tensor,
570
+ sequence_length: int,
571
+ target_length: int,
572
+ dtype: torch.dtype,
573
+ cache_position: torch.Tensor,
574
+ batch_size: int,
575
+ **kwargs,
576
+ ):
577
+ """
578
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
579
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
580
+
581
+ Args:
582
+ attention_mask (`torch.Tensor`):
583
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
584
+ `(batch_size, 1, query_length, key_value_length)`.
585
+ sequence_length (`int`):
586
+ The sequence length being processed.
587
+ target_length (`int`):
588
+ The target length: when generating with static cache, the mask should be as long as the static cache,
589
+ to account for the 0 padding, the part of the cache that is not filled yet.
590
+ dtype (`torch.dtype`):
591
+ The dtype to use for the 4D attention mask.
592
+ cache_position (`torch.Tensor`):
593
+ Indices depicting the position of the input sequence tokens in the sequence.
594
+ batch_size (`torch.Tensor`):
595
+ Batch size.
596
+ """
597
+ if attention_mask is not None and attention_mask.dim() == 4:
598
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
599
+ causal_mask = attention_mask
600
+ else:
601
+ min_dtype = torch.finfo(dtype).min
602
+ causal_mask = torch.full(
603
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
604
+ )
605
+ if sequence_length != 1:
606
+ causal_mask = torch.triu(causal_mask, diagonal=1)
607
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
608
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
609
+ if attention_mask is not None:
610
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
611
+ mask_length = attention_mask.shape[-1]
612
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
613
+ causal_mask.device
614
+ )
615
+ padding_mask = padding_mask == 0
616
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
617
+ padding_mask, min_dtype
618
+ )
619
+
620
+ return causal_mask
621
+
622
+
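A tiny sketch of the 2D-to-4D conversion documented above, calling the static helper directly on a left-padded mask; the values and sizes are illustrative only.
```python
# Hypothetical illustration of _prepare_4d_causal_attention_mask_with_cache_position.
import torch

attention_mask = torch.tensor([[0, 1, 1, 1]])   # 2D mask, batch_size=1, left padding
cache_position = torch.arange(4)

mask_4d = HyperCLOVAXModel._prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask,
    sequence_length=4,
    target_length=4,
    dtype=torch.float32,
    cache_position=cache_position,
    batch_size=1,
)
# Shape (batch_size, 1, query_length, key_value_length); masked slots hold the dtype minimum.
assert mask_4d.shape == (1, 1, 4, 4)
```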
623
+ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
624
+
625
+
626
+ @auto_docstring
627
+ class HyperCLOVAXForCausalLM(HyperCLOVAXPreTrainedModel, GenerationMixin):
628
+ _tied_weights_keys = ["lm_head.weight"]
629
+ _tp_plan = {"lm_head": "colwise_rep"}
630
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
631
+
632
+ def __init__(self, config):
633
+ super().__init__(config)
634
+ self.model = HyperCLOVAXModel(config)
635
+ self.vocab_size = config.vocab_size
636
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
637
+ self.logits_scaling = getattr(config, "logits_scaling", 1.0)
638
+
639
+ # Initialize weights and apply final processing
640
+ self.post_init()
641
+
642
+ def get_input_embeddings(self):
643
+ return self.model.embed_tokens
644
+
645
+ def set_input_embeddings(self, value):
646
+ self.model.embed_tokens = value
647
+
648
+ def get_output_embeddings(self):
649
+ return self.lm_head
650
+
651
+ def set_output_embeddings(self, new_embeddings):
652
+ self.lm_head = new_embeddings
653
+
654
+ def set_decoder(self, decoder):
655
+ self.model = decoder
656
+
657
+ def get_decoder(self):
658
+ return self.model
659
+
660
+ @can_return_tuple
661
+ @auto_docstring
662
+ def forward(
663
+ self,
664
+ input_ids: Optional[torch.LongTensor] = None,
665
+ attention_mask: Optional[torch.Tensor] = None,
666
+ position_ids: Optional[torch.LongTensor] = None,
667
+ past_key_values: Optional[Cache] = None,
668
+ inputs_embeds: Optional[torch.FloatTensor] = None,
669
+ labels: Optional[torch.LongTensor] = None,
670
+ use_cache: Optional[bool] = None,
671
+ output_attentions: Optional[bool] = None,
672
+ output_hidden_states: Optional[bool] = None,
673
+ cache_position: Optional[torch.LongTensor] = None,
674
+ logits_to_keep: Union[int, torch.Tensor] = 0,
675
+ **kwargs: Unpack[KwargsForCausalLM],
676
+ ) -> CausalLMOutputWithPast:
677
+ r"""
678
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
679
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
680
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
681
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
682
+
683
+ Example:
684
+
685
+ ```python
686
+ >>> from transformers import AutoTokenizer, HyperCLOVAXForCausalLM
687
+
688
+ >>> model = HyperCLOVAXForCausalLM.from_pretrained("naver-hyperclovax/{model_name}")
689
+ >>> tokenizer = AutoTokenizer.from_pretrained("naver-hyperclovax/{model_name}")
690
+
691
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
692
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
693
+
694
+ >>> # Generate
695
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
696
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
697
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
698
+ ```"""
699
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
700
+ output_hidden_states = (
701
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
702
+ )
703
+
704
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
705
+ outputs: BaseModelOutputWithPast = self.model(
706
+ input_ids=input_ids,
707
+ attention_mask=attention_mask,
708
+ position_ids=position_ids,
709
+ past_key_values=past_key_values,
710
+ inputs_embeds=inputs_embeds,
711
+ use_cache=use_cache,
712
+ output_attentions=output_attentions,
713
+ output_hidden_states=output_hidden_states,
714
+ cache_position=cache_position,
715
+ **kwargs,
716
+ )
717
+
718
+ hidden_states = outputs.last_hidden_state
719
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
720
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
721
+ # MuP
722
+ logits = self.lm_head(hidden_states[:, slice_indices, :]) * self.logits_scaling
723
+
724
+ loss = None
725
+ if labels is not None:
726
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
727
+
728
+ return CausalLMOutputWithPast(
729
+ loss=loss,
730
+ logits=logits,
731
+ past_key_values=outputs.past_key_values,
732
+ hidden_states=outputs.hidden_states,
733
+ attentions=outputs.attentions,
734
+ )
735
+
736
+
737
+ @auto_docstring(
738
+ custom_intro="""
739
+ The HyperCLOVAX Model transformer with a sequence classification head on top (linear layer).
740
+
741
+ [`HyperCLOVAXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
742
+ (e.g. GPT-2) do.
743
+
744
+ Since it does classification on the last token, it requires to know the position of the last token. If a
745
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
746
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
747
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
748
+ each row of the batch).
749
+ """
750
+ )
751
+ class HyperCLOVAXForSequenceClassification(HyperCLOVAXPreTrainedModel):
752
+ def __init__(self, config):
753
+ super().__init__(config)
754
+ self.num_labels = config.num_labels
755
+ self.model = HyperCLOVAXModel(config)
756
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
757
+
758
+ # Initialize weights and apply final processing
759
+ self.post_init()
760
+
761
+ def get_input_embeddings(self):
762
+ return self.model.embed_tokens
763
+
764
+ def set_input_embeddings(self, value):
765
+ self.model.embed_tokens = value
766
+
767
+ @can_return_tuple
768
+ @auto_docstring
769
+ def forward(
770
+ self,
771
+ input_ids: Optional[torch.LongTensor] = None,
772
+ attention_mask: Optional[torch.Tensor] = None,
773
+ position_ids: Optional[torch.LongTensor] = None,
774
+ past_key_values: Optional[Cache] = None,
775
+ inputs_embeds: Optional[torch.FloatTensor] = None,
776
+ labels: Optional[torch.LongTensor] = None,
777
+ use_cache: Optional[bool] = None,
778
+ output_attentions: Optional[bool] = None,
779
+ output_hidden_states: Optional[bool] = None,
780
+ ) -> SequenceClassifierOutputWithPast:
781
+ r"""
782
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
783
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
784
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
785
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
786
+ """
787
+
788
+ transformer_outputs: BaseModelOutputWithPast = self.model(
789
+ input_ids,
790
+ attention_mask=attention_mask,
791
+ position_ids=position_ids,
792
+ past_key_values=past_key_values,
793
+ inputs_embeds=inputs_embeds,
794
+ use_cache=use_cache,
795
+ output_attentions=output_attentions,
796
+ output_hidden_states=output_hidden_states,
797
+ )
798
+ hidden_states = transformer_outputs.last_hidden_state
799
+ logits = self.score(hidden_states)
800
+
801
+ if input_ids is not None:
802
+ batch_size = input_ids.shape[0]
803
+ else:
804
+ batch_size = inputs_embeds.shape[0]
805
+
806
+ if self.config.pad_token_id is None and batch_size != 1:
807
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
808
+ if self.config.pad_token_id is None:
809
+ last_non_pad_token = -1
810
+ elif input_ids is not None:
811
+ # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
812
+ non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
813
+ token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
814
+ last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
815
+ else:
816
+ last_non_pad_token = -1
817
+ logger.warning_once(
818
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
819
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
820
+ )
821
+
822
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
823
+
824
+ loss = None
825
+ if labels is not None:
826
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
827
+
828
+ return SequenceClassifierOutputWithPast(
829
+ loss=loss,
830
+ logits=pooled_logits,
831
+ past_key_values=transformer_outputs.past_key_values,
832
+ hidden_states=transformer_outputs.hidden_states,
833
+ attentions=transformer_outputs.attentions,
834
+ )
835
+
836
+
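The intro above says classification pools the last token that is not a padding token; the indexing trick used in `forward` can be sketched in isolation like this (toy ids, hypothetical `pad_token_id`):
```python
# Last-non-pad-token selection as used by HyperCLOVAXForSequenceClassification (illustration only).
import torch

pad_token_id = 0
input_ids = torch.tensor([[7, 8, 9, 0, 0],    # right padding -> last real token at index 2
                          [0, 0, 7, 8, 9]])   # left padding  -> last real token at index 4

non_pad_mask = (input_ids != pad_token_id).int()
token_indices = torch.arange(input_ids.shape[-1])
last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
assert last_non_pad_token.tolist() == [2, 4]
```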
837
+ @auto_docstring
838
+ class HyperCLOVAXForQuestionAnswering(HyperCLOVAXPreTrainedModel):
839
+ base_model_prefix = "transformer"
840
+
841
+ # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->HyperCLOVAX
842
+ def __init__(self, config):
843
+ super().__init__(config)
844
+ self.transformer = HyperCLOVAXModel(config)
845
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
846
+
847
+ # Initialize weights and apply final processing
848
+ self.post_init()
849
+
850
+ def get_input_embeddings(self):
851
+ return self.transformer.embed_tokens
852
+
853
+ def set_input_embeddings(self, value):
854
+ self.transformer.embed_tokens = value
855
+
856
+ @can_return_tuple
857
+ @auto_docstring
858
+ def forward(
859
+ self,
860
+ input_ids: Optional[torch.LongTensor] = None,
861
+ attention_mask: Optional[torch.Tensor] = None,
862
+ position_ids: Optional[torch.LongTensor] = None,
863
+ past_key_values: Optional[Cache] = None,
864
+ inputs_embeds: Optional[torch.FloatTensor] = None,
865
+ start_positions: Optional[torch.LongTensor] = None,
866
+ end_positions: Optional[torch.LongTensor] = None,
867
+ output_attentions: Optional[bool] = None,
868
+ output_hidden_states: Optional[bool] = None,
869
+ **kwargs,
870
+ ) -> QuestionAnsweringModelOutput:
871
+ outputs: BaseModelOutputWithPast = self.transformer(
872
+ input_ids,
873
+ attention_mask=attention_mask,
874
+ position_ids=position_ids,
875
+ past_key_values=past_key_values,
876
+ inputs_embeds=inputs_embeds,
877
+ output_attentions=output_attentions,
878
+ output_hidden_states=output_hidden_states,
879
+ )
880
+
881
+ sequence_output = outputs.last_hidden_state
882
+
883
+ logits = self.qa_outputs(sequence_output)
884
+ start_logits, end_logits = logits.split(1, dim=-1)
885
+ start_logits = start_logits.squeeze(-1).contiguous()
886
+ end_logits = end_logits.squeeze(-1).contiguous()
887
+
888
+ loss = None
889
+ if start_positions is not None and end_positions is not None:
890
+ loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
891
+
892
+ return QuestionAnsweringModelOutput(
893
+ loss=loss,
894
+ start_logits=start_logits,
895
+ end_logits=end_logits,
896
+ hidden_states=outputs.hidden_states,
897
+ attentions=outputs.attentions,
898
+ )
899
+
900
+
901
+ @auto_docstring
902
+ class HyperCLOVAXForTokenClassification(HyperCLOVAXPreTrainedModel):
903
+ def __init__(self, config):
904
+ super().__init__(config)
905
+ self.num_labels = config.num_labels
906
+ self.model = HyperCLOVAXModel(config)
907
+ if getattr(config, "classifier_dropout", None) is not None:
908
+ classifier_dropout = config.classifier_dropout
909
+ elif getattr(config, "hidden_dropout", None) is not None:
910
+ classifier_dropout = config.hidden_dropout
911
+ else:
912
+ classifier_dropout = 0.1
913
+ self.dropout = nn.Dropout(classifier_dropout)
914
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
915
+
916
+ # Initialize weights and apply final processing
917
+ self.post_init()
918
+
919
+ def get_input_embeddings(self):
920
+ return self.model.embed_tokens
921
+
922
+ def set_input_embeddings(self, value):
923
+ self.model.embed_tokens = value
924
+
925
+ @can_return_tuple
926
+ @auto_docstring
927
+ def forward(
928
+ self,
929
+ input_ids: Optional[torch.LongTensor] = None,
930
+ attention_mask: Optional[torch.Tensor] = None,
931
+ position_ids: Optional[torch.LongTensor] = None,
932
+ past_key_values: Optional[Cache] = None,
933
+ inputs_embeds: Optional[torch.FloatTensor] = None,
934
+ labels: Optional[torch.LongTensor] = None,
935
+ use_cache: Optional[bool] = None,
936
+ output_attentions: Optional[bool] = None,
937
+ output_hidden_states: Optional[bool] = None,
938
+ ) -> TokenClassifierOutput:
939
+ r"""
940
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
941
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
942
+ config.num_labels - 1]`. A classification loss (Cross-Entropy) is computed over the
943
+ labeled tokens.
944
+ """
945
+
946
+ outputs: BaseModelOutputWithPast = self.model(
947
+ input_ids,
948
+ attention_mask=attention_mask,
949
+ position_ids=position_ids,
950
+ past_key_values=past_key_values,
951
+ inputs_embeds=inputs_embeds,
952
+ use_cache=use_cache,
953
+ output_attentions=output_attentions,
954
+ output_hidden_states=output_hidden_states,
955
+ )
956
+ sequence_output = outputs.last_hidden_state
957
+ sequence_output = self.dropout(sequence_output)
958
+ logits = self.score(sequence_output)
959
+
960
+ loss = None
961
+ if labels is not None:
962
+ loss = self.loss_function(logits, labels, self.config)
963
+
964
+ return TokenClassifierOutput(
965
+ loss=loss,
966
+ logits=logits,
967
+ hidden_states=outputs.hidden_states,
968
+ attentions=outputs.attentions,
969
+ )
970
+
971
+
972
+ __all__ = [
973
+ "HyperCLOVAXForCausalLM",
974
+ "HyperCLOVAXModel",
975
+ "HyperCLOVAXPreTrainedModel",
976
+ "HyperCLOVAXForSequenceClassification",
977
+ "HyperCLOVAXForQuestionAnswering",
978
+ "HyperCLOVAXForTokenClassification",
979
+ ]
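Since this modeling file ships as custom code inside the model repository, loading it through `transformers` requires `trust_remote_code=True`. A minimal, hedged usage sketch (the repository id below is the docstring's placeholder; substitute the real one):
```python
# Hypothetical end-to-end usage of the architecture defined above (sketch, not shipped code).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "naver-hyperclovax/{model_name}"  # placeholder, as in the docstring above

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, trust_remote_code=True
)

inputs = tokenizer("Hey, are you conscious?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```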
special_tokens_map.json ADDED
@@ -0,0 +1,86 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|fim_prefix|>",
5
+ "<|fim_middle|>",
6
+ "<|fim_suffix|>",
7
+ "<|endofprompt|>",
8
+ "<|_unuse_missing_100256|>",
9
+ "<|_unuse_missing_100261|>",
10
+ "<|_unuse_missing_100262|>",
11
+ "<|_unuse_missing_100263|>",
12
+ "<|_unuse_missing_100264|>",
13
+ "<|_unuse_missing_100265|>",
14
+ "<|_unuse_missing_100266|>",
15
+ "<|_unuse_missing_100267|>",
16
+ "<|_unuse_missing_100268|>",
17
+ "<|_unuse_missing_100269|>",
18
+ "<|_unuse_missing_100270|>",
19
+ "<|_unuse_missing_100271|>",
20
+ "<|im_start|>",
21
+ "<|im_end|>",
22
+ "<|stop|>",
23
+ "<|endofturn|>",
24
+ "<repo_name>",
25
+ "<file_sep>",
26
+ "<issue_start>",
27
+ "<issue_comment>",
28
+ "<issue_closed>",
29
+ "<jupyter_start>",
30
+ "<jupyter_text>",
31
+ "<jupyter_code>",
32
+ "<jupyter_output>",
33
+ "<jupyter_script>",
34
+ "<empty_output>",
35
+ "<code_to_intermediate>",
36
+ "<intermediate_to_code>",
37
+ "<pr>",
38
+ "<pr_status>",
39
+ "<pr_is_merged>",
40
+ "<pr_base>",
41
+ "<pr_file>",
42
+ "<pr_base_code>",
43
+ "<pr_diff>",
44
+ "<pr_diff_hunk>",
45
+ "<pr_comment>",
46
+ "<pr_event_id>",
47
+ "<pr_review>",
48
+ "<pr_review_state>",
49
+ "<pr_review_comment>",
50
+ "<pr_in_reply_to_review_id>",
51
+ "<pr_in_reply_to_comment_id>",
52
+ "<pr_diff_hunk_comment_line>",
53
+ "<NAME>",
54
+ "<EMAIL>",
55
+ "<KEY>",
56
+ "<PASSWORD>"
57
+ ],
58
+ "bos_token": {
59
+ "content": "<|endoftext|>",
60
+ "lstrip": false,
61
+ "normalized": false,
62
+ "rstrip": false,
63
+ "single_word": false
64
+ },
65
+ "eos_token": {
66
+ "content": "<|endoftext|>",
67
+ "lstrip": false,
68
+ "normalized": false,
69
+ "rstrip": false,
70
+ "single_word": false
71
+ },
72
+ "pad_token": {
73
+ "content": "<|endoftext|>",
74
+ "lstrip": false,
75
+ "normalized": false,
76
+ "rstrip": false,
77
+ "single_word": false
78
+ },
79
+ "unk_token": {
80
+ "content": "<|endoftext|>",
81
+ "lstrip": false,
82
+ "normalized": false,
83
+ "rstrip": false,
84
+ "single_word": false
85
+ }
86
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,501 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "100256": {
5
+ "content": "<|_unuse_missing_100256|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "100257": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "100258": {
21
+ "content": "<|fim_prefix|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "100259": {
29
+ "content": "<|fim_middle|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "100260": {
37
+ "content": "<|fim_suffix|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "100261": {
45
+ "content": "<|_unuse_missing_100261|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "100262": {
53
+ "content": "<|_unuse_missing_100262|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "100263": {
61
+ "content": "<|_unuse_missing_100263|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "100264": {
69
+ "content": "<|_unuse_missing_100264|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "100265": {
77
+ "content": "<|_unuse_missing_100265|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "100266": {
85
+ "content": "<|_unuse_missing_100266|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "100267": {
93
+ "content": "<|_unuse_missing_100267|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "100268": {
101
+ "content": "<|_unuse_missing_100268|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "100269": {
109
+ "content": "<|_unuse_missing_100269|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "100270": {
117
+ "content": "<|_unuse_missing_100270|>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "100271": {
125
+ "content": "<|_unuse_missing_100271|>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "100272": {
133
+ "content": "<|im_start|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "100273": {
141
+ "content": "<|im_end|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "100274": {
149
+ "content": "<|stop|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "100275": {
157
+ "content": "<|endofturn|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "100276": {
165
+ "content": "<|endofprompt|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "110491": {
173
+ "content": "<repo_name>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "110492": {
181
+ "content": "<file_sep>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "110493": {
189
+ "content": "<issue_start>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "110494": {
197
+ "content": "<issue_comment>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "110495": {
205
+ "content": "<issue_closed>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "110496": {
213
+ "content": "<jupyter_start>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "110497": {
221
+ "content": "<jupyter_text>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "110498": {
229
+ "content": "<jupyter_code>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "110499": {
237
+ "content": "<jupyter_output>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "110500": {
245
+ "content": "<jupyter_script>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "110501": {
253
+ "content": "<empty_output>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "110502": {
261
+ "content": "<code_to_intermediate>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "110503": {
269
+ "content": "<intermediate_to_code>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "110504": {
277
+ "content": "<pr>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "110505": {
285
+ "content": "<pr_status>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "110506": {
293
+ "content": "<pr_is_merged>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "110507": {
301
+ "content": "<pr_base>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "110508": {
309
+ "content": "<pr_file>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "110509": {
317
+ "content": "<pr_base_code>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "110510": {
325
+ "content": "<pr_diff>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "110511": {
333
+ "content": "<pr_diff_hunk>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "110512": {
341
+ "content": "<pr_comment>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "110513": {
349
+ "content": "<pr_event_id>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "110514": {
357
+ "content": "<pr_review>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "110515": {
365
+ "content": "<pr_review_state>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "110516": {
373
+ "content": "<pr_review_comment>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "110517": {
381
+ "content": "<pr_in_reply_to_review_id>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "110518": {
389
+ "content": "<pr_in_reply_to_comment_id>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "110519": {
397
+ "content": "<pr_diff_hunk_comment_line>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "110520": {
405
+ "content": "<NAME>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "110521": {
413
+ "content": "<EMAIL>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "110522": {
421
+ "content": "<KEY>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "110523": {
429
+ "content": "<PASSWORD>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ }
436
+ },
437
+ "additional_special_tokens": [
438
+ "<|endoftext|>",
439
+ "<|fim_prefix|>",
440
+ "<|fim_middle|>",
441
+ "<|fim_suffix|>",
442
+ "<|endofprompt|>",
443
+ "<|_unuse_missing_100256|>",
444
+ "<|_unuse_missing_100261|>",
445
+ "<|_unuse_missing_100262|>",
446
+ "<|_unuse_missing_100263|>",
447
+ "<|_unuse_missing_100264|>",
448
+ "<|_unuse_missing_100265|>",
449
+ "<|_unuse_missing_100266|>",
450
+ "<|_unuse_missing_100267|>",
451
+ "<|_unuse_missing_100268|>",
452
+ "<|_unuse_missing_100269|>",
453
+ "<|_unuse_missing_100270|>",
454
+ "<|_unuse_missing_100271|>",
455
+ "<|im_start|>",
456
+ "<|im_end|>",
457
+ "<|stop|>",
458
+ "<|endofturn|>",
459
+ "<repo_name>",
460
+ "<file_sep>",
461
+ "<issue_start>",
462
+ "<issue_comment>",
463
+ "<issue_closed>",
464
+ "<jupyter_start>",
465
+ "<jupyter_text>",
466
+ "<jupyter_code>",
467
+ "<jupyter_output>",
468
+ "<jupyter_script>",
469
+ "<empty_output>",
470
+ "<code_to_intermediate>",
471
+ "<intermediate_to_code>",
472
+ "<pr>",
473
+ "<pr_status>",
474
+ "<pr_is_merged>",
475
+ "<pr_base>",
476
+ "<pr_file>",
477
+ "<pr_base_code>",
478
+ "<pr_diff>",
479
+ "<pr_diff_hunk>",
480
+ "<pr_comment>",
481
+ "<pr_event_id>",
482
+ "<pr_review>",
483
+ "<pr_review_state>",
484
+ "<pr_review_comment>",
485
+ "<pr_in_reply_to_review_id>",
486
+ "<pr_in_reply_to_comment_id>",
487
+ "<pr_diff_hunk_comment_line>",
488
+ "<NAME>",
489
+ "<EMAIL>",
490
+ "<KEY>",
491
+ "<PASSWORD>"
492
+ ],
493
+ "bos_token": "<|endoftext|>",
494
+ "chat_template": "{% if tools is not defined or tools is none %}\n {{- '<|im_start|>tool_list\\n<|im_end|>\\n' }}\n{%- else %}\n {{- '<|im_start|>tool_list\\n[' }}\n {%- for tool in tools %}\n {{- '{\"name\": \"' }}\n {{- tool.function.name }}\n {{- '\", ' }}\n {{- '\"description\": \"' }}\n {{- tool.function.description }}\n {{- '\"' }}\n {%- if tool.function.parameters is defined %}\n {{- ', \"parameters\": ' }}\n {{- tool.function.parameters | tojson }}\n {%- endif %}\n {{- '}' }}\n {%- if not loop.last %}\n {{- ', ' }}\n {%- endif %}\n {%- endfor %}\n{{- ']<|im_end|>\\n' }}\n{%- endif %}\n\n{%- set ns = namespace(is_searching=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.is_searching and (message.role == 'user' or message.role == 'tool') %}\n {%- set ns.last_query_index = index %}\n {%- set ns.is_searching = false %}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages %}\n {%- if loop.index0 == 0 and message.role != 'system' %}\n {{- '<|im_start|>system\\n<|im_end|>\\n' }}\n {%- endif %}\n\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %} \n {%- endif %}\n {%- if message.role == \"assistant\" %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if reasoning_content %}\n {{- '<|im_start|>assistant/think\\n' + reasoning_content.strip('\\n') + '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n\n {%- if content %}\n {{- '<|im_start|>assistant\\n' + content.strip('\\n') + '<|im_end|>' }}\n {%- if message.tool_calls %}\n {{- '\\n' }}\n {%- else %}\n {{- '<|endofturn|>\\n' }}\n {%- endif %}\n {%- endif %}\n\n {%- if message.tool_calls %}\n {{- '<|im_start|>assistant -> tool/function_call\\n[' }}\n {%- for tool_call in message.tool_calls %}\n {%- if not loop.first %}\n {{- ', ' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}' }}\n {%- endfor %}\n {{- ']<|im_end|><|stop|>\\n' }}\n\n {%- endif %}\n {%- elif message.role == \"tool\" %}\n {{- '<|im_start|>tool/function_call\\n' + content + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {%- if force_reasoning is defined and force_reasoning is true %}\n {{- '<|im_start|>assistant/think\\n' }}\n {%- elif skip_reasoning is defined and skip_reasoning is true %}\n {{- '<|im_start|>assistant\\n' }}\n {%- else %}\n {{- '<|im_start|>assistant' }}\n {%- endif %}\n{%- endif %}",
495
+ "clean_up_tokenization_spaces": true,
496
+ "eos_token": "<|endoftext|>",
497
+ "model_max_length": 1000000000000000019884624838656,
498
+ "pad_token": "<|endoftext|>",
499
+ "tokenizer_class": "GPT2Tokenizer",
500
+ "unk_token": "<|endoftext|>"
501
+ }
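The `chat_template` above reads optional `force_reasoning` / `skip_reasoning` flags to decide whether the generation prompt opens the `assistant/think` channel. A hedged sketch of driving it through `apply_chat_template` (extra keyword arguments are forwarded to the template renderer in recent `transformers` releases; the repository id is a placeholder):
```python
# Hypothetical sketch of using the chat template from tokenizer_config.json.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("naver-hyperclovax/{model_name}")  # placeholder id

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize RMSNorm in one sentence."},
]

# Default: the prompt ends with "<|im_start|>assistant", letting the model pick the channel.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Explicitly open or skip the reasoning channel via the template's flags.
thinking_prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, force_reasoning=True
)
direct_prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, skip_reasoning=True
)
```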
vocab.json ADDED
The diff for this file is too large to render. See raw diff