phongdtd committed on
Commit
3c36852
·
1 Parent(s): 1d7790f

add tokenizer

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. vocab.json +1 -1
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint-*/
vocab.json CHANGED
@@ -1 +1 @@
1
- {"": 0, "a": 1, "": 2, "": 3, "á": 4, "": 5, "b": 6, "": 7, "đ": 8, "g": 9, "p": 10, "s": 11, "": 12, "": 13, "": 14, "": 15, "": 16, "ò": 17, "à": 18, "": 19, "â": 20, "v": 21, "ă": 22, "": 23, "o": 24, "j": 25, "": 26, "": 27, "": 28, "ĩ": 29, "t": 30, "ó": 31, "ù": 32, "": 33, "h": 34, "ơ": 35, "": 36, "": 37, "l": 38, "": 39, "": 40, "": 41, "n": 42, "q": 43, "": 44, "r": 45, "x": 46, "õ": 47, "": 48, "ô": 49, "": 50, "ê": 51, "": 52, "ã": 53, "i": 54, "ũ": 55, "": 56, "": 57, "y": 58, "": 59, "é": 60, "": 61, "ư": 62, "": 63, "": 64, "m": 66, "ế": 67, "": 68, "": 69, "e": 70, "w": 71, "c": 72, "ì": 73, "": 74, "": 75, "z": 76, "k": 77, "è": 78, "í": 79, "": 80, "ý": 81, "ú": 82, "": 83, "": 84, "u": 85, "": 86, "": 87, "d": 88, "f": 89, "": 90, "": 91, "": 92, "": 93, "|": 65, "[UNK]": 94, "[PAD]": 95}
 
1
+ {"": 0, "đ": 1, "y": 2, "": 3, "ù": 4, "ĩ": 5, "h": 6, "": 7, "e": 8, "è": 9, "ú": 10, "": 11, "": 12, "": 13, "é": 14, "": 15, "ă": 16, "u": 17, "": 18, "x": 19, "ô": 20, "": 21, "": 22, "": 23, "": 24, "": 25, "": 26, "": 27, "": 28, "g": 29, "": 30, "ê": 31, "k": 32, "": 33, "": 34, "c": 35, "": 36, "s": 37, "b": 38, "": 39, "v": 40, "": 41, "â": 42, "": 43, "w": 44, "": 45, "": 46, "": 47, "": 48, "í": 49, "t": 50, "l": 51, "": 52, "ó": 53, "i": 54, "ế": 55, "ý": 56, "a": 57, "": 58, "ư": 59, "": 60, "": 61, "": 62, "": 63, "": 64, "f": 65, "ự": 66, "m": 67, "ì": 68, "": 69, "": 70, "": 71, "ò": 72, "ã": 73, "": 74, "": 75, "": 76, "p": 77, "o": 78, "q": 79, "ơ": 81, "ũ": 82, "": 83, "n": 84, "õ": 85, "á": 86, "d": 87, "à": 88, "r": 89, "": 90, "": 91, "z": 92, "j": 93, "|": 80, "[UNK]": 94, "[PAD]": 95}