Upload tokenizer
tokenization_dart.py  CHANGED  (+8, -1)
@@ -15,6 +15,12 @@ VOCAB_FILES_NAMES = {
     "tag_category": "tag_category.json",
 }
 
+PRETRAINED_VOCAB_FILES_MAP = {
+    "tag_category": {
+        "p1atdev/tokenizer_test_1": "https://huggingface.co/p1atdev/tokenizer_test_1/resolve/main/tag_category.json"
+    }
+}
+
 
 @dataclass
 class Category:
@@ -63,6 +69,7 @@ class DartTokenizer(PreTrainedTokenizerFast):
     """Dart tokenizer"""
 
     vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
 
     def __init__(self, tag_category, **kwargs):
         super().__init__(**kwargs)
@@ -137,7 +144,7 @@ class DartTokenizer(PreTrainedTokenizerFast):
         input_ids: List[int],
         category_mask: Optional[Dict[str, np.ndarray]] = None,
     ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
-        """Get the next token's vocab mask"""
+        """Get the next token's vocab mask and a category mask"""
 
         if category_mask == None:
             category_mask = self.category_mask