Lakoc
/

cz_ec_bpe1000

@@ -5,7 +5,7 @@
   "added_tokens": [
     {
       "id": 0,
-      "content": "(BOS)",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -14,7 +14,7 @@
     },
     {
       "id": 1,
-      "content": "(EOS)",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -23,16 +23,16 @@
     },
     {
       "id": 2,
-      "content": "(UNK)",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": true,
-      "special": false
     },
     {
       "id": 3,
-      "content": "(PAD)",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -41,7 +41,7 @@
     },
     {
       "id": 4,
-      "content": "(MASK)",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -49,7 +49,7 @@
       "special": true
     },
     {
-      "id": 1000,
       "content": "(LNG)",
       "single_word": false,
       "lstrip": false,
@@ -58,7 +58,16 @@
       "special": false
     },
     {
-      "id": 1001,
       "content": "(SPN)",
       "single_word": false,
       "lstrip": false,
@@ -85,7 +94,7 @@
       },
       {
         "SpecialToken": {
-          "id": "(EOS)",
           "type_id": 0
         }
       }
@@ -99,7 +108,7 @@
       },
       {
         "SpecialToken": {
-          "id": "(EOS)",
           "type_id": 0
         }
       },
@@ -111,28 +120,28 @@
       },
       {
         "SpecialToken": {
-          "id": "(EOS)",
           "type_id": 1
         }
       }
     ],
     "special_tokens": {
-      "(BOS)": {
-        "id": "(BOS)",
         "ids": [
           0
         ],
         "tokens": [
-          "(BOS)"
         ]
       },
-      "(EOS)": {
-        "id": "(EOS)",
         "ids": [
           1
         ],
         "tokens": [
-          "(EOS)"
         ]
       }
     }
@@ -146,17 +155,17 @@
   "model": {
     "type": "BPE",
     "dropout": null,
-    "unk_token": "(UNK)",
     "continuing_subword_prefix": null,
     "end_of_word_suffix": null,
     "fuse_unk": false,
     "byte_fallback": false,
     "vocab": {
-      "(BOS)": 0,
-      "(EOS)": 1,
-      "(UNK)": 2,
-      "(PAD)": 3,
-      "(MASK)": 4,
       "!": 5,
       "%": 6,
       "'": 7,
@@ -1148,10 +1157,7 @@
       "Ġznamen": 993,
       "vÃ½": 994,
       "ovala": 995,
-      "rych": 996,
-      "ckÃ¡": 997,
-      "ĠJ": 998,
-      "Ġdisku": 999
     },
     "merges": [
       "Ã Ń",
@@ -2011,10 +2017,7 @@
       "Ġzna men",
       "v Ã½",
       "ova la",
-      "ry ch",
-      "ck Ã¡",
-      "Ġ J",
-      "Ġdi sku"
     ]
   }
 }

   "added_tokens": [
     {
       "id": 0,
+      "content": "([bos])",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
     },
     {
       "id": 1,
+      "content": "([eos])",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
     },
     {
       "id": 2,
+      "content": "([unk])",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
+      "normalized": false,
+      "special": true
     },
     {
       "id": 3,
+      "content": "([pad])",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
     },
     {
       "id": 4,
+      "content": "([mask])",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
       "special": true
     },
     {
+      "id": 997,
       "content": "(LNG)",
       "single_word": false,
       "lstrip": false,
       "special": false
     },
     {
+      "id": 998,
+      "content": "(UNK)",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 999,
       "content": "(SPN)",
       "single_word": false,
       "lstrip": false,
       },
       {
         "SpecialToken": {
+          "id": "([eos])",
           "type_id": 0
         }
       }
       },
       {
         "SpecialToken": {
+          "id": "([eos])",
           "type_id": 0
         }
       },
       },
       {
         "SpecialToken": {
+          "id": "([eos])",
           "type_id": 1
         }
       }
     ],
     "special_tokens": {
+      "([bos])": {
+        "id": "([bos])",
         "ids": [
           0
         ],
         "tokens": [
+          "([bos])"
         ]
       },
+      "([eos])": {
+        "id": "([eos])",
         "ids": [
           1
         ],
         "tokens": [
+          "([eos])"
         ]
       }
     }
   "model": {
     "type": "BPE",
     "dropout": null,
+    "unk_token": "([unk])",
     "continuing_subword_prefix": null,
     "end_of_word_suffix": null,
     "fuse_unk": false,
     "byte_fallback": false,
     "vocab": {
+      "([bos])": 0,
+      "([eos])": 1,
+      "([unk])": 2,
+      "([pad])": 3,
+      "([mask])": 4,
       "!": 5,
       "%": 6,
       "'": 7,
       "Ġznamen": 993,
       "vÃ½": 994,
       "ovala": 995,
+      "rych": 996
     },
     "merges": [
       "Ã Ń",
       "Ġzna men",
       "v Ã½",
       "ova la",
+      "ry ch"
     ]
   }
 }

tokenizer_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "(BOS)",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -9,7 +9,7 @@
       "special": true
     },
     "1": {
-      "content": "(EOS)",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -17,15 +17,15 @@
       "special": true
     },
     "2": {
-      "content": "(UNK)",
       "lstrip": false,
-      "normalized": true,
       "rstrip": false,
       "single_word": false,
-      "special": false
     },
     "3": {
-      "content": "(PAD)",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -33,14 +33,14 @@
       "special": true
     },
     "4": {
-      "content": "(MASK)",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "1000": {
       "content": "(LNG)",
       "lstrip": false,
       "normalized": true,
@@ -48,7 +48,15 @@
       "single_word": false,
       "special": false
     },
-    "1001": {
       "content": "(SPN)",
       "lstrip": false,
       "normalized": true,
@@ -57,12 +65,12 @@
       "special": false
     }
   },
-  "bos_token": "(BOS)",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "(EOS)",
-  "mask_token": "(MASK)",
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "(PAD)",
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "unk_token": "(UNK)"
 }

 {
   "added_tokens_decoder": {
     "0": {
+      "content": "([bos])",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "1": {
+      "content": "([eos])",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "2": {
+      "content": "([unk])",
       "lstrip": false,
+      "normalized": false,
       "rstrip": false,
       "single_word": false,
+      "special": true
     },
     "3": {
+      "content": "([pad])",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "4": {
+      "content": "([mask])",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
+    "997": {
       "content": "(LNG)",
       "lstrip": false,
       "normalized": true,
       "single_word": false,
       "special": false
     },
+    "998": {
+      "content": "(UNK)",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "999": {
       "content": "(SPN)",
       "lstrip": false,
       "normalized": true,
       "special": false
     }
   },
+  "bos_token": "([bos])",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "([eos])",
+  "mask_token": "([mask])",
   "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "([pad])",
   "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "([unk])"
 }