yangwang825 committed
Commit f14099b · verified · 1 Parent(s): 0166738

End of training
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ step_*
+ epoch_*
README.md ADDED
@@ -0,0 +1,62 @@
+ ---
+ language:
+ - en
+ tags:
+ - token-classification
+ - ner
+ - pytorch
+ - custom-model
+ library_name: transformers
+ ---
+
+ # UnmaskingQwen3 for Token Classification
+
+ This model is a fine-tuned version of the custom `UnmaskingQwen3ForTokenClassification` architecture for token classification.
+
+ ## Model Details
+
+ - **Model Type**: Custom UnmaskingQwen3ForTokenClassification
+ - **Task**: Token Classification (NER/POS/Chunking)
+ - **Training Framework**: Transformers + Accelerate
+
+ ## Usage
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+ # Load the model and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("your-username/your-model-name", trust_remote_code=True)
+ model = AutoModelForTokenClassification.from_pretrained("your-username/your-model-name", trust_remote_code=True)
+
+ # Use for inference
+ inputs = tokenizer(["Your text here"], return_tensors="pt", is_split_into_words=False)
+ outputs = model(**inputs)
+ predictions = outputs.logits.argmax(dim=-1)
+ ```
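+
+ To map the predicted ids back to label strings, the `id2label` mapping stored in the model config can be used. A minimal sketch, continuing from the snippet above (the label names come from this repository's `config.json`):
+
+ ```python
+ # Convert per-token predictions into human-readable labels
+ tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+ labels = [model.config.id2label[int(i)] for i in predictions[0]]
+ for token, label in zip(tokens, labels):
+     print(token, label)
+ ```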
+
+ ## Training Details
+
+ - **Training Data**: ['automated-analytics/ai4privacy-pii-masking-en-v1-ner', 'automated-analytics/gretel-pii-masking-en-v1-ner']
+ - **Learning Rate**: 5e-05
+ - **Batch Size**: 128
+ - **Epochs**: 3
+ - **Max Length**: 128
+
+ ## Evaluation Results
+
+ ### Overall Metrics
+
+ - Precision: 0.0000
+ - Recall: 0.0000
+ - F1 Score: 0.0000
+ - Accuracy: 0.0000
+
+ ### Entity-level Performance
+
+ | Entity Type | Precision | Recall | F1-Score | Support |
+ | ----------- | --------- | ------ | -------- | ------- |
+
+ ## Important Note
+
+ This model uses a custom model class. Make sure to use `trust_remote_code=True` when loading the model.
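+
+ For quick experiments, the model can also be wrapped in a `pipeline`; a minimal sketch, assuming the same placeholder repository id as above:
+
+ ```python
+ from transformers import pipeline
+
+ # Token-classification pipeline; aggregation merges B-/I- pieces into entity spans
+ ner = pipeline(
+     "token-classification",
+     model="your-username/your-model-name",
+     trust_remote_code=True,
+     aggregation_strategy="simple",
+ )
+ print(ner("Your text here"))
+ ```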
__init__.py ADDED
@@ -0,0 +1 @@
+ from .modeling_unmasking_qwen import UnmaskingQwen2ForTokenClassification, UnmaskingQwen3ForTokenClassification
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
config.json ADDED
@@ -0,0 +1,205 @@
+ {
+   "architectures": [
+     "UnmaskingQwen2ForTokenClassification"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 896,
+   "id2label": {
+     "0": "O",
+     "1": "B-medical_record_number",
+     "2": "I-medical_record_number",
+     "3": "B-date_of_birth",
+     "4": "I-date_of_birth",
+     "5": "B-ssn",
+     "6": "I-ssn",
+     "7": "B-date",
+     "8": "I-date",
+     "9": "B-first_name",
+     "10": "I-first_name",
+     "11": "B-email",
+     "12": "I-email",
+     "13": "B-last_name",
+     "14": "I-last_name",
+     "15": "B-customer_id",
+     "16": "I-customer_id",
+     "17": "B-employee_id",
+     "18": "I-employee_id",
+     "19": "B-name",
+     "20": "I-name",
+     "21": "B-street_address",
+     "22": "I-street_address",
+     "23": "B-phone_number",
+     "24": "I-phone_number",
+     "25": "B-ipv4",
+     "26": "I-ipv4",
+     "27": "B-credit_card_number",
+     "28": "I-credit_card_number",
+     "29": "B-license_plate",
+     "30": "I-license_plate",
+     "31": "B-address",
+     "32": "I-address",
+     "33": "B-user_name",
+     "34": "I-user_name",
+     "35": "B-device_identifier",
+     "36": "I-device_identifier",
+     "37": "B-bank_routing_number",
+     "38": "I-bank_routing_number",
+     "39": "B-date_time",
+     "40": "I-date_time",
+     "41": "B-company_name",
+     "42": "I-company_name",
+     "43": "B-unique_identifier",
+     "44": "I-unique_identifier",
+     "45": "B-biometric_identifier",
+     "46": "I-biometric_identifier",
+     "47": "B-account_number",
+     "48": "I-account_number",
+     "49": "B-city",
+     "50": "I-city",
+     "51": "B-certificate_license_number",
+     "52": "I-certificate_license_number",
+     "53": "B-time",
+     "54": "I-time",
+     "55": "B-postcode",
+     "56": "I-postcode",
+     "57": "B-vehicle_identifier",
+     "58": "I-vehicle_identifier",
+     "59": "B-coordinate",
+     "60": "I-coordinate",
+     "61": "B-country",
+     "62": "I-country",
+     "63": "B-api_key",
+     "64": "I-api_key",
+     "65": "B-ipv6",
+     "66": "I-ipv6",
+     "67": "B-password",
+     "68": "I-password",
+     "69": "B-health_plan_beneficiary_number",
+     "70": "I-health_plan_beneficiary_number",
+     "71": "B-national_id",
+     "72": "I-national_id",
+     "73": "B-tax_id",
+     "74": "I-tax_id",
+     "75": "B-url",
+     "76": "I-url",
+     "77": "B-state",
+     "78": "I-state",
+     "79": "B-swift_bic",
+     "80": "I-swift_bic",
+     "81": "B-cvv",
+     "82": "I-cvv",
+     "83": "B-pin",
+     "84": "I-pin"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 4864,
+   "label2id": {
+     "B-account_number": 47,
+     "B-address": 31,
+     "B-api_key": 63,
+     "B-bank_routing_number": 37,
+     "B-biometric_identifier": 45,
+     "B-certificate_license_number": 51,
+     "B-city": 49,
+     "B-company_name": 41,
+     "B-coordinate": 59,
+     "B-country": 61,
+     "B-credit_card_number": 27,
+     "B-customer_id": 15,
+     "B-cvv": 81,
+     "B-date": 7,
+     "B-date_of_birth": 3,
+     "B-date_time": 39,
+     "B-device_identifier": 35,
+     "B-email": 11,
+     "B-employee_id": 17,
+     "B-first_name": 9,
+     "B-health_plan_beneficiary_number": 69,
+     "B-ipv4": 25,
+     "B-ipv6": 65,
+     "B-last_name": 13,
+     "B-license_plate": 29,
+     "B-medical_record_number": 1,
+     "B-name": 19,
+     "B-national_id": 71,
+     "B-password": 67,
+     "B-phone_number": 23,
+     "B-pin": 83,
+     "B-postcode": 55,
+     "B-ssn": 5,
+     "B-state": 77,
+     "B-street_address": 21,
+     "B-swift_bic": 79,
+     "B-tax_id": 73,
+     "B-time": 53,
+     "B-unique_identifier": 43,
+     "B-url": 75,
+     "B-user_name": 33,
+     "B-vehicle_identifier": 57,
+     "I-account_number": 48,
+     "I-address": 32,
+     "I-api_key": 64,
+     "I-bank_routing_number": 38,
+     "I-biometric_identifier": 46,
+     "I-certificate_license_number": 52,
+     "I-city": 50,
+     "I-company_name": 42,
+     "I-coordinate": 60,
+     "I-country": 62,
+     "I-credit_card_number": 28,
+     "I-customer_id": 16,
+     "I-cvv": 82,
+     "I-date": 8,
+     "I-date_of_birth": 4,
+     "I-date_time": 40,
+     "I-device_identifier": 36,
+     "I-email": 12,
+     "I-employee_id": 18,
+     "I-first_name": 10,
+     "I-health_plan_beneficiary_number": 70,
+     "I-ipv4": 26,
+     "I-ipv6": 66,
+     "I-last_name": 14,
+     "I-license_plate": 30,
+     "I-medical_record_number": 2,
+     "I-name": 20,
+     "I-national_id": 72,
+     "I-password": 68,
+     "I-phone_number": 24,
+     "I-pin": 84,
+     "I-postcode": 56,
+     "I-ssn": 6,
+     "I-state": 78,
+     "I-street_address": 22,
+     "I-swift_bic": 80,
+     "I-tax_id": 74,
+     "I-time": 54,
+     "I-unique_identifier": 44,
+     "I-url": 76,
+     "I-user_name": 34,
+     "I-vehicle_identifier": 58,
+     "O": 0
+   },
+   "max_position_embeddings": 32768,
+   "max_window_layers": 21,
+   "model_type": "qwen2",
+   "num_attention_heads": 14,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": 32768,
+   "tie_word_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936,
+   "auto_map": {
+     "AutoModelForTokenClassification": "modeling_unmasking_qwen.UnmaskingQwen3ForTokenClassification"
+   }
+ }
configuration_unmasking_qwen.py ADDED
@@ -0,0 +1,5 @@
+ from transformers.configuration_utils import PretrainedConfig
+
+ class UnmaskingQwenConfig(PretrainedConfig):
+     model_type = "unmasking_qwen"
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a794f6e3e7c5ad34ec15ddb9fd27c18ee491a588c9981fbe23454b24fddc97fa
+ size 1976468620
modeling_unmasking_qwen.py ADDED
@@ -0,0 +1,1051 @@
+ # Unmasking Qwen Token Classification Models
+ # Automatically generated file for model use with trust_remote_code=True
+
+ from typing import Optional, Tuple, Union, List, Dict, Callable
+
+ import torch
+ import torch.nn as nn
+
+ from transformers import PreTrainedModel, Qwen2Config
+ from transformers.modeling_outputs import TokenClassifierOutput, BaseModelOutputWithPast
+ from transformers.cache_utils import Cache
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+ from transformers.processing_utils import Unpack
+ from transformers.models.bert import (
+     BertConfig, BertModel, BertPreTrainedModel
+ )
+ from transformers.models.roberta import (
+     RobertaConfig, RobertaModel, RobertaPreTrainedModel
+ )
+ from transformers.models.deberta_v2 import (
+     DebertaV2Config, DebertaV2Model, DebertaV2PreTrainedModel
+ )
+ from transformers.models.modernbert.modeling_modernbert import (
+     ModernBertConfig, ModernBertModel, ModernBertPreTrainedModel, ModernBertPredictionHead
+ )
+ from transformers.models.qwen2.modeling_qwen2 import (
+     Qwen2PreTrainedModel,
+     Qwen2Model,
+     SlidingWindowCache,
+     StaticCache
+ )
+ from transformers.models.qwen3.modeling_qwen3 import (
+     Qwen3PreTrainedModel,
+     Qwen3Config,
+     Qwen3Model,
+     Qwen3RMSNorm,
+     Qwen3DecoderLayer,
+     Qwen3Attention,
+     Qwen3RotaryEmbedding,
+     Qwen3MLP,
+     apply_rotary_pos_emb,
+     can_return_tuple,
+     eager_attention_forward
+ )
+
+
+ def fixed_cross_entropy(
+     source: torch.Tensor,
+     target: torch.Tensor,
+     num_items_in_batch: Optional[int] = None,
+     ignore_index: int = -100,
+     **kwargs,
+ ) -> torch.Tensor:
+     reduction = "sum" if num_items_in_batch is not None else "mean"
+     loss = nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction=reduction)
+     if reduction == "sum":
+         if not isinstance(num_items_in_batch, torch.Tensor):
+             num_items_in_batch = torch.tensor(num_items_in_batch, device=loss.device, dtype=loss.dtype)
+         elif num_items_in_batch.device != loss.device:
+             num_items_in_batch = num_items_in_batch.to(loss.device)
+         loss = loss / num_items_in_batch
+     return loss
+
+
+ class BertForTokenClassification(BertPreTrainedModel):
+
+     def __init__(self, config: BertConfig):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         self.bert = BertModel(config, add_pooling_layer=False)
+         classifier_dropout = (
+             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         num_items_in_batch: Optional[torch.Tensor] = None,
+         ignore_index: int = -100,
+         **kwargs,
+     ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+
+         sequence_output = self.dropout(sequence_output)
+         logits = self.classifier(sequence_output)
+
+         loss = None
+         if labels is not None:
+             logits = logits.view(-1, self.num_labels)
+             labels = labels.view(-1).to(logits.device)
+             logits = logits.float()
+             loss = fixed_cross_entropy(logits, labels, num_items_in_batch, ignore_index)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class CRF(nn.Module):
+     """Conditional random field (CRF) layer, based on a more stable implementation"""
+
+     def __init__(self, num_labels: int):
+         super().__init__()
+         self.num_labels = num_labels
+
+         # Transition matrix and start/end transition parameters
+         self.start_transitions = nn.Parameter(torch.empty(num_labels))
+         self.end_transitions = nn.Parameter(torch.empty(num_labels))
+         self.transitions = nn.Parameter(torch.empty(num_labels, num_labels))
+
+         # Initialize the parameters from a uniform distribution
+         nn.init.uniform_(self.start_transitions, -0.1, 0.1)
+         nn.init.uniform_(self.end_transitions, -0.1, 0.1)
+         nn.init.uniform_(self.transitions, -0.1, 0.1)
+
+     def _compute_log_denominator(self, features: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         """Compute the log of the partition function"""
+         seq_len, batch_size, _ = features.shape
+
+         # Initialize the score with the start transition scores + the features of the first time step
+         log_score = self.start_transitions + features[0]  # [batch_size, num_labels]
+
+         # Accumulate scores one time step at a time
+         for i in range(1, seq_len):
+             # All possible transition scores: previous-step score + transition score + current-step features
+             # [batch_size, num_labels, 1] + [num_labels, num_labels] + [batch_size, 1, num_labels]
+             # -> [batch_size, num_labels, num_labels]
+             next_score = (
+                 log_score.unsqueeze(2) +  # [batch_size, num_labels, 1]
+                 self.transitions +  # [num_labels, num_labels]
+                 features[i].unsqueeze(1)  # [batch_size, 1, num_labels]
+             )
+
+             # Take the logsumexp over all possible previous labels
+             next_score = torch.logsumexp(next_score, dim=1)
+
+             # Update the score according to the mask
+             log_score = torch.where(mask[i].unsqueeze(1), next_score, log_score)
+
+         # Add the transition scores to the end label
+         log_score += self.end_transitions
+
+         # Take the logsumexp over all possible final labels
+         return torch.logsumexp(log_score, dim=1)
+
+     def _compute_log_numerator(self, features: torch.Tensor, labels: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         """Compute the score of the given label sequence"""
+         seq_len, batch_size, _ = features.shape
+
+         # Initialize the score
+         score = self.start_transitions[labels[0]] + features[0, torch.arange(batch_size), labels[0]]
+
+         # Accumulate the score one time step at a time
+         for i in range(1, seq_len):
+             # Add the transition score and the emission score
+             score += (
+                 self.transitions[labels[i-1], labels[i]] +  # transition score
+                 features[i, torch.arange(batch_size), labels[i]]  # emission score
+             ) * mask[i]  # only counted at valid positions
+
+         # Compute the sequence lengths (minus 1 because indices start at 0)
+         seq_lens = mask.sum(dim=0) - 1
+
+         # Get the last valid label of each sequence
+         last_tags = labels[seq_lens.long(), torch.arange(batch_size)]
+
+         # Add the transition score to the end label
+         score += self.end_transitions[last_tags]
+
+         return score
+
+     def forward(self, emissions: torch.Tensor, tags: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         """
+         Compute the CRF negative log-likelihood loss
+
+         Args:
+             emissions: (seq_len, batch_size, num_labels) emission scores
+             tags: (seq_len, batch_size) gold labels
+             mask: (seq_len, batch_size) mask for handling variable-length sequences
+
+         Returns:
+             CRF negative log-likelihood loss
+         """
+         # Compute the log of the numerator and of the denominator (partition function)
+         log_numerator = self._compute_log_numerator(emissions, tags, mask)
+         log_denominator = self._compute_log_denominator(emissions, mask)
+
+         # The loss is the denominator minus the numerator
+         loss = torch.mean(log_denominator - log_numerator)
+
+         return loss
+
+     def _viterbi_decode(self, features: torch.Tensor, mask: torch.Tensor) -> List[List[int]]:
+         """Viterbi decoding: find the most likely label sequence"""
+         seq_len, batch_size, _ = features.shape
+
+         # Initialize the Viterbi variables
+         log_score = self.start_transitions + features[0]  # [batch_size, num_labels]
+         backpointers = torch.zeros((seq_len, batch_size, self.num_labels), dtype=torch.long, device=features.device)
+
+         # Fill in one time step at a time
+         for i in range(1, seq_len):
+             # Compute all possible transition scores
+             next_score = log_score.unsqueeze(2) + self.transitions + features[i].unsqueeze(1)
+
+             # Find the best previous label for each current label
+             next_score, indices = next_score.max(dim=1)
+
+             # Record the backpointers
+             backpointers[i] = indices
+
+             # Update the score according to the mask
+             log_score = torch.where(mask[i].unsqueeze(1), next_score, log_score)
+
+         # Add the transition scores to the end label
+         log_score += self.end_transitions
+
+         # Find the last valid position of each sequence
+         seq_lens = mask.sum(dim=0).long() - 1  # sequence lengths
+
+         # Backtrack to recover the best paths
+         best_paths = []
+         for seq_idx in range(batch_size):
+             # Find the final label with the highest score
+             best_label = torch.argmax(log_score[seq_idx]).item()
+             best_path = [best_label]
+
+             # Backtrack from the end to the start
+             for i in range(seq_lens[seq_idx], 0, -1):
+                 best_label = backpointers[i, seq_idx, best_label].item()
+                 best_path.insert(0, best_label)
+
+             best_paths.append(best_path)
+
+         return best_paths
+
+     def decode(self, emissions: torch.Tensor, mask: torch.Tensor) -> List[List[int]]:
+         """Find the most likely label sequence with Viterbi decoding"""
+         # Make sure the mask is boolean
+         if mask.dtype != torch.bool:
+             mask = mask.bool()
+
+         with torch.no_grad():
+             return self._viterbi_decode(emissions, mask)
+
+
+ class BertCRFForTokenClassification(BertPreTrainedModel):
+     """BERT model combined with a CRF layer for token classification"""
+
+     def __init__(self, config: BertConfig):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         # BERT encoder
+         self.bert = BertModel(config, add_pooling_layer=False)
+
+         # Dropout and classifier
+         classifier_dropout = (
+             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # CRF layer
+         self.crf = CRF(config.num_labels)
+
+         # Initialize weights
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         ignore_index: int = -100,
+         **kwargs,
+     ) -> Union[Tuple[torch.Tensor], Dict[str, torch.Tensor]]:
+         """
+         Forward pass for sequence labeling with a CRF
+
+         Args:
+             labels: label sequence used to compute the loss
+             ignore_index: label value to ignore, usually -100
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # BERT forward pass
+         outputs = self.bert(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+         sequence_output = self.dropout(sequence_output)
+
+         # Emission scores
+         logits = self.classifier(sequence_output)  # [batch_size, seq_len, num_labels]
+
+         loss = None
+         if labels is not None:
+             # Prepare the input format expected by the CRF
+             # Swap dimensions: [batch_size, seq_len, num_labels] -> [seq_len, batch_size, num_labels]
+             emissions = logits.transpose(0, 1)
+
+             # Swap dimensions: [batch_size, seq_len] -> [seq_len, batch_size]
+             if attention_mask is not None:
+                 attention_mask_t = attention_mask.transpose(0, 1).bool()
+             else:
+                 attention_mask_t = torch.ones(emissions.shape[:2], dtype=torch.bool, device=emissions.device)
+
+             # Handle ignore_index
+             if ignore_index is not None:
+                 labels_mask = (labels != ignore_index)
+                 attention_mask_t = attention_mask_t & labels_mask.transpose(0, 1)
+
+                 # Build a label tensor that contains no ignore_index
+                 crf_labels = labels.clone()
+                 crf_labels[~labels_mask] = 0  # temporarily set ignored positions to 0 so they do not affect the CRF computation
+                 crf_labels_t = crf_labels.transpose(0, 1)
+             else:
+                 crf_labels_t = labels.transpose(0, 1)
+
+             # Compute the CRF loss
+             loss = self.crf(
+                 emissions=emissions,
+                 tags=crf_labels_t,
+                 mask=attention_mask_t
+             )
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def decode(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> List[List[int]]:
+         """
+         Decode the most likely label sequence
+         """
+         # No gradients needed
+         with torch.no_grad():
+             # BERT forward pass
+             outputs = self.bert(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 token_type_ids=token_type_ids,
+                 return_dict=True,
+                 **kwargs,
+             )
+
+             sequence_output = outputs[0]
+             sequence_output = self.dropout(sequence_output)
+
+             # Emission scores
+             logits = self.classifier(sequence_output)  # [batch_size, seq_len, num_labels]
+
+             # Swap dimensions: [batch_size, seq_len, num_labels] -> [seq_len, batch_size, num_labels]
+             emissions = logits.transpose(0, 1)
+
+             # Prepare the mask
+             if attention_mask is not None:
+                 mask = attention_mask.transpose(0, 1).bool()
+             else:
+                 mask = torch.ones(emissions.shape[:2], dtype=torch.bool, device=emissions.device)
+
+             # Decode with the Viterbi algorithm
+             best_tags = self.crf.decode(emissions, mask)
+
+         return best_tags
+
+
+ class RobertaForTokenClassification(RobertaPreTrainedModel):
+
+     def __init__(self, config: RobertaConfig):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         self.roberta = RobertaModel(config, add_pooling_layer=False)
+         classifier_dropout = (
+             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         token_type_ids: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         num_items_in_batch: Optional[torch.Tensor] = None,
+         ignore_index: int = -100,
+         **kwargs,
+     ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+         r"""
+         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+             - 0 corresponds to a *sentence A* token,
+             - 1 corresponds to a *sentence B* token.
+             This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+             >= 2. All the value in this tensor should be always < type_vocab_size.
+
+             [What are token type IDs?](../glossary#token-type-ids)
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.roberta(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+
+         sequence_output = self.dropout(sequence_output)
+         logits = self.classifier(sequence_output)
+
+         loss = None
+         if labels is not None:
+             # Upcast to float if we need to compute the loss to avoid potential precision issues
+             logits = logits.view(-1, self.num_labels)
+             labels = labels.view(-1).to(logits.device)
+             logits = logits.float()
+             loss = fixed_cross_entropy(logits, labels, num_items_in_batch, ignore_index)
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
+
+     def __init__(self, config: DebertaV2Config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         self.deberta = DebertaV2Model(config)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         num_items_in_batch: Optional[torch.Tensor] = None,
+         ignore_index: int = -100,
+         **kwargs,
+     ) -> Union[Tuple, TokenClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.deberta(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+
+         sequence_output = self.dropout(sequence_output)
+         logits = self.classifier(sequence_output)
+
+         loss = None
+         if labels is not None:
+             # Upcast to float if we need to compute the loss to avoid potential precision issues
+             logits = logits.view(-1, self.num_labels)
+             labels = labels.view(-1).to(logits.device)
+             logits = logits.float()
+             loss = fixed_cross_entropy(logits, labels, num_items_in_batch, ignore_index)
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TokenClassifierOutput(
+             loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+         )
+
+
+ class ModernBertForTokenClassification(ModernBertPreTrainedModel):
+
+     def __init__(self, config: ModernBertConfig):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         self.model = ModernBertModel(config)
+         self.head = ModernBertPredictionHead(config)
+         self.drop = torch.nn.Dropout(config.classifier_dropout)
+         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         sliding_window_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         indices: Optional[torch.Tensor] = None,
+         cu_seqlens: Optional[torch.Tensor] = None,
+         max_seqlen: Optional[int] = None,
+         batch_size: Optional[int] = None,
+         seq_len: Optional[int] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         num_items_in_batch: Optional[torch.Tensor] = None,
+         ignore_index: int = -100,
+         **kwargs,
+     ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+         r"""
+         sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
+             perform global attention, while the rest perform local attention. This mask is used to avoid attending to
+             far-away tokens in the local attention layers when not using Flash Attention.
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+         indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
+             Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
+         cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
+             Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
+         max_seqlen (`int`, *optional*):
+             Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
+         batch_size (`int`, *optional*):
+             Batch size of the input sequences. Used to pad the output tensors.
+         seq_len (`int`, *optional*):
+             Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         self._maybe_set_compile()
+
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             sliding_window_mask=sliding_window_mask,
+             position_ids=position_ids,
+             inputs_embeds=inputs_embeds,
+             indices=indices,
+             cu_seqlens=cu_seqlens,
+             max_seqlen=max_seqlen,
+             batch_size=batch_size,
+             seq_len=seq_len,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         last_hidden_state = outputs[0]
+
+         last_hidden_state = self.head(last_hidden_state)
+         last_hidden_state = self.drop(last_hidden_state)
+         logits = self.classifier(last_hidden_state)
+
+         loss = None
+         if labels is not None:
+             # Upcast to float if we need to compute the loss to avoid potential precision issues
+             logits = logits.view(-1, self.num_labels)
+             labels = labels.view(-1).to(logits.device)
+             logits = logits.float()
+             loss = fixed_cross_entropy(logits, labels, num_items_in_batch, ignore_index)
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class UnmaskingQwen3Attention(Qwen3Attention):
+     """Multi-headed attention without causal mask for bidirectional attention"""
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+         attention_mask: Optional[torch.Tensor],
+         past_key_value: Optional[Cache] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         **kwargs: Unpack[FlashAttentionKwargs],
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         input_shape = hidden_states.shape[:-1]
+         hidden_shape = (*input_shape, -1, self.head_dim)
+
+         query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+         key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+         cos, sin = position_embeddings
+         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+         if past_key_value is not None:
+             # sin and cos are specific to RoPE models; cache_position needed for the static cache
+             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+         # Use eager attention as default
+         attention_interface: Callable = eager_attention_forward
+
+         # Remove causal mask by setting attention_mask to None or creating a non-causal mask
+         # For bidirectional attention, we don't want any masking except padding
+         if attention_mask is not None and 0.0 in attention_mask:
+             # Keep only padding mask if it exists, remove causal part
+             # This allows tokens to attend to future tokens
+             pass
+         else:
+             # If there's no padding, we can set attention_mask to None for full attention
+             attention_mask = None
+
+         attn_output, attn_weights = attention_interface(
+             self,
+             query_states,
+             key_states,
+             value_states,
+             attention_mask,
+             dropout=0.0 if not self.training else self.attention_dropout,
+             scaling=self.scaling,
+             sliding_window=self.sliding_window,
+             **kwargs,
+         )
+
+         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+         attn_output = self.o_proj(attn_output)
+         return attn_output, attn_weights
+
+
+ class UnmaskingQwen3DecoderLayer(Qwen3DecoderLayer):
+
+     def __init__(self, config: Qwen3Config, layer_idx: int):
+         super(Qwen3DecoderLayer, self).__init__()
+         self.hidden_size = config.hidden_size
+         self.self_attn = UnmaskingQwen3Attention(config=config, layer_idx=layer_idx)
+         self.mlp = Qwen3MLP(config)
+         self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+
+ class UnmaskingQwen3Model(Qwen3Model):
+
+     def __init__(self, config: Qwen3Config):
+         super(Qwen3PreTrainedModel, self).__init__(config)
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+         self.layers = nn.ModuleList(
+             [UnmaskingQwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+         )
+         self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.rotary_emb = Qwen3RotaryEmbedding(config=config)
+         self.gradient_checkpointing = False
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def _update_causal_mask(
+         self,
+         attention_mask: torch.Tensor,
+         input_tensor: torch.Tensor,
+         cache_position: torch.Tensor,
+         past_key_values: Cache,
+         output_attentions: bool = False,
+     ):
+         # Override the causal mask creation to create a non-causal mask
+         # This allows bidirectional attention
+         if attention_mask is None:
+             # If no attention mask is provided, return None to allow full attention
+             return None
+
+         # If attention_mask is provided, it's likely for padding
+         # Convert it to the right format but without the causal constraint
+         dtype = input_tensor.dtype
+         min_dtype = torch.finfo(dtype).min
+         batch_size = input_tensor.shape[0]
+         sequence_length = input_tensor.shape[1]
+
+         if isinstance(attention_mask, torch.Tensor) and attention_mask.dim() == 2:
+             # Convert 2D padding mask to 4D attention mask
+             expanded_attn_mask = attention_mask[:, None, None, :]
+             expanded_attn_mask = expanded_attn_mask.to(dtype=dtype)
+             expanded_attn_mask = (1.0 - expanded_attn_mask) * min_dtype
+             return expanded_attn_mask
+
+         # If it's already 4D, return as is
+         return attention_mask
+
+
+ class UnmaskingQwen3ForTokenClassification(Qwen3PreTrainedModel):
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.model = UnmaskingQwen3Model(config)
+         if getattr(config, "classifier_dropout", None) is not None:
+             classifier_dropout = config.classifier_dropout
+         elif getattr(config, "hidden_dropout", None) is not None:
+             classifier_dropout = config.hidden_dropout
+         else:
+             classifier_dropout = 0.1
+         self.dropout = nn.Dropout(classifier_dropout)
+         self.score = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.model.embed_tokens = value
+
+     @can_return_tuple
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+     ) -> TokenClassifierOutput:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the token classification loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+
+         outputs: BaseModelOutputWithPast = self.model(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+         )
+         sequence_output = outputs.last_hidden_state
+         sequence_output = self.dropout(sequence_output)
+         logits = self.score(sequence_output)
+
+         loss = None
+         if labels is not None:
+             loss_fct = nn.CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ class UnmaskingQwen2Model(Qwen2Model):
+     """
+     UnmaskingQwen2Model is a modified version of Qwen2Model that removes the causal mask,
+     allowing bidirectional attention similar to BERT-like models.
+     """
+
+     def _update_causal_mask(
+         self,
+         attention_mask: torch.Tensor,
+         input_tensor: torch.Tensor,
+         cache_position: torch.Tensor,
+         past_key_values: Cache,
+         output_attentions: bool = False,
+     ):
+         """
+         Override the causal mask creation to create a non-causal (bidirectional) mask.
+         This allows each token to attend to all tokens in the sequence.
+         """
+         # For flash attention, just return None or the padding mask
+         if self.config._attn_implementation == "flash_attention_2":
+             if attention_mask is not None and 0.0 in attention_mask:
+                 return attention_mask
+             return None
+
+         # For flex attention, keep the same behavior but without causality
+         if self.config._attn_implementation == "flex_attention":
+             if isinstance(attention_mask, torch.Tensor):
+                 # We don't convert to causal mask here
+                 return attention_mask
+             return attention_mask
+
+         # For other attention implementations, create a non-causal mask
+         batch_size = input_tensor.shape[0]
+         sequence_length = input_tensor.shape[1]
+         dtype = input_tensor.dtype
+
+         # For SlidingWindowCache or StaticCache
+         if isinstance(past_key_values, (SlidingWindowCache, StaticCache)):
+             target_length = past_key_values.get_max_cache_shape()
+         else:
+             # For DynamicCache or no cache
+             target_length = (
+                 attention_mask.shape[-1]
+                 if isinstance(attention_mask, torch.Tensor)
+                 else past_key_values.get_seq_length() + sequence_length + 1
+                 if past_key_values is not None
+                 else sequence_length
+             )
+
+         # Create a non-causal mask (all zeros, allowing full attention)
+         # Instead of using min_dtype to mask out future tokens, we use zeros to allow attention to all positions
+         non_causal_mask = torch.zeros(
+             (batch_size, 1, sequence_length, target_length),
+             dtype=dtype,
+             device=input_tensor.device,
+         )
+
+         # If there's a padding attention mask, apply it
+         if attention_mask is not None:
+             if attention_mask.dim() == 2:
+                 # Convert 2D attention mask to 4D
+                 expanded_mask = attention_mask[:, None, None, :].expand(
+                     batch_size, 1, sequence_length, attention_mask.shape[-1]
+                 ).to(non_causal_mask.device)
+
+                 # Apply padding mask (0 for tokens to attend to, large negative for padded positions)
+                 min_dtype = torch.finfo(dtype).min
+                 padding_mask = expanded_mask == 0
+                 non_causal_mask = non_causal_mask.masked_fill(padding_mask, min_dtype)
+             elif attention_mask.dim() == 4:
+                 # If already 4D, use as is
+                 non_causal_mask = attention_mask
+
+         return non_causal_mask
+
+
+ class UnmaskingQwen2ForTokenClassification(Qwen2PreTrainedModel):
+     """
+     Qwen2 model with a token classification head on top, but with bidirectional attention.
+     This is achieved by using the UnmaskingQwen2Model which removes the causal mask.
+     """
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+
+         # Use the UnmaskingQwen2Model instead of the standard Qwen2Model
+         self.model = UnmaskingQwen2Model(config)
+
+         if getattr(config, "classifier_dropout", None) is not None:
+             classifier_dropout = config.classifier_dropout
+         elif getattr(config, "hidden_dropout", None) is not None:
+             classifier_dropout = config.hidden_dropout
+         else:
+             classifier_dropout = 0.1
+
+         self.dropout = nn.Dropout(classifier_dropout)
+         self.score = nn.Linear(config.hidden_size, config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.model.embed_tokens = value
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
+     ) -> TokenClassifierOutput:
+         """
+         Forward pass for token classification with bidirectional attention.
+
+         Args:
+             input_ids: Input token IDs
+             attention_mask: Attention mask
+             position_ids: Position IDs
+             past_key_values: Past key values for efficient generation
+             inputs_embeds: Pre-computed input embeddings
+             labels: Token classification labels
+             use_cache: Whether to use cache for efficient generation
+             output_attentions: Whether to output attention weights
+             output_hidden_states: Whether to output hidden states
+             flash_attn_kwargs: Additional arguments for flash attention
+
+         Returns:
+             TokenClassifierOutput with loss, logits, and optional hidden states and attentions
+         """
+         outputs: BaseModelOutputWithPast = self.model(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             **flash_attn_kwargs,
+         )
+
+         sequence_output = outputs.last_hidden_state
+         sequence_output = self.dropout(sequence_output)
+         logits = self.score(sequence_output)
+
+         loss = None
+         if labels is not None:
+             loss = self.loss_function(logits, labels, self.config)
+
+         return TokenClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
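Taken together, `__init__.py`, `configuration_unmasking_qwen.py`, and this modeling file form the remote-code package that `trust_remote_code=True` loads. For local experiments, one could also register the custom config/model pair with the Auto classes directly; a minimal sketch under the assumption that the files above are importable from the working directory (the registration calls are standard transformers Auto API, the pairing itself is illustrative):

```python
from transformers import AutoConfig, AutoModelForTokenClassification
from configuration_unmasking_qwen import UnmaskingQwenConfig
from modeling_unmasking_qwen import UnmaskingQwen3ForTokenClassification

# Register the custom config and model so the Auto classes can resolve them by model_type
AutoConfig.register("unmasking_qwen", UnmaskingQwenConfig)
AutoModelForTokenClassification.register(UnmaskingQwenConfig, UnmaskingQwen3ForTokenClassification)
```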
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e72410ddfe59d4ec8aeea923685710c76492112d74f5d127b86e2d08d65a3a4
+ size 11422174
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff