update
- playground_examples.py  +138 -113
- playground_util.py      +63 -30
- vocab.py                +1 -1
playground_examples.py
CHANGED
@@ -1,113 +1,138 @@
+"""
+
+## characters
+
+- alphanumeric characters
+- numeric characters
+- special characters: A special character is a character that is not an alphabetic or numeric character.
+- ASCII control characters
+- punctuation marks
+- accent marks
+- mathematical symbols
+- whitespace:
+  - https://en.wikipedia.org/wiki/Whitespace_character
+  - https://emptycharacter.com/
+
+
+https://www.computerhope.com/jargon/s/specchar.htm
+"""
+
+import random
+from datasets import load_dataset
+
+default_user_input = """\
+Replace this text in the input field to see how tokenization works.
+Buenos días!
+华为发布Mate60手机。
+ラグビーワールドカップ2023フランス"""
+# default_tokenizer_name_1 = "Meta/llama3"
+# default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
+default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
+default_tokenizer_name_2 = "openai/gpt-4o"
+
+
+def get_sample_input():
+    default_inputs = {
+        "en": "Replace this text in the input field to see how tokenization works.",
+        "zh-Hans": "",
+        "es": "",
+        "de": "",
+    }
+    random.seed(10)  # For reproducibility
+    lines = []
+    for lang in default_inputs.keys():
+        dataset = load_dataset("eson/cc100-samples", lang, split="train")
+        print(dataset)
+        print(1)
+    return default_inputs
+
+
+examples = {
+    "en": [
+        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
+        [
+            "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
+            "huggyllama/llama-7b",
+            "google-bert/bert-base-cased",
+        ],  # chatglm has blank_n tokens; bert drops the spaces
+        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
+        [
+            'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
+            "google/gemma-7b",
+            "huggyllama/llama-7b",
+        ],  # llama's vocab is a bit small
+        [
+            "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
+            "baichuan-inc/Baichuan-7B",
+            "huggyllama/llama-7b",
+        ],
+        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
+    ],
+    "zh": [
+        [
+            "空格测试: 2个空格 8个空格",
+            "llama",
+            "chatglm2_6b",
+        ],  # chatglm has blank_n tokens
+        ["标点测试:,。!?;", "baichuan_7b", "llama"],
+        [
+            "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
+            "baichuan_7b",
+            "llama",
+        ],
+        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
+    ],
+}
+
+
+more_examples = [
+    # bert family
+    (
+        "google-bert/bert-base-cased",
+        "google-bert/bert-base-uncased",
+        "",
+        "",
+    ),  # clue VS kplug, bert VS clue
+    ("bert-base-cased", "clue", "", "增加了[]()"),
+    ("roberta-chinese-clue", "kplug", "", ""),
+    # llama family (sentencepiece-based)
+    (
+        "baichuan",
+        "baichuan2",
+        "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1",
+    ),
+    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
+    ("llama", "chinese-llama-2-7b", ""),
+    ("llama", "llama3", "扩充词典"),
+    ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
+    # glm family (sentencepiece-based)
+    ("glm", "chatglm1", ""),
+    ("chatglm1", "chatglm2", ""),
+    # gpt2 family
+    ("gpt2", "moss", ""),
+    ("", "", ""),
+    # openai family (tiktoken)
+    ("qwen", "gpt_35_turbo", ""),
+]
+
+lang = "en"
+
+example_types = [t[0].split(":")[0] for t in examples[lang]]
+
+
+def example_fn(example_idx):
+    return examples[lang][example_idx]
+
+
+def get_more_example():
+    import urllib.parse
+
+    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
+    # some entries carry a trailing comment field and some do not, so unpack only the first three
+    for tokenizer1, tokenizer2, text, *_ in more_examples:
+        full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
+        print(full_url)
+
+
+if __name__ == "__main__":
+    get_more_example()
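For reference, a minimal sketch (not part of this commit) of the share-URL format that get_more_example() prints and that the Space presumably reads back in on_load(): the query parameters tokenizer1, tokenizer2 and text are taken from the code above, and parse_qs undoes the quote() applied to the text.

from urllib.parse import parse_qs, quote, urlsplit

# Hypothetical round-trip of one share URL in the format printed by get_more_example().
url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
text = "whitespace: 2spaces 8spaces\t1tab"
full_url = f"{url_prefix}?tokenizer1=huggyllama/llama-7b&tokenizer2=google-bert/bert-base-cased&text={quote(text)}"

params = parse_qs(urlsplit(full_url).query)
print(params["tokenizer1"][0], params["tokenizer2"][0])
assert params["text"][0] == text  # quote()/parse_qs round-trip keeps the text intact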
playground_util.py
CHANGED
@@ -6,18 +6,24 @@ from vocab import tokenizer_factory
 from character_util import iter_vocab
 from utils.log_util import logger
 from utils.i18n_util import get_lang
-from playground_examples import
+from playground_examples import (
+    default_tokenizer_name_1,
+    default_tokenizer_name_2,
+    default_user_input,
+)
 from functools import lru_cache


 @lru_cache
 def _tokenize(
-    tokenizer_name: str,
-    color_num: int = 5,
-    add_special_token: bool = False
+    text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
 ):
-    logger.info(
+    logger.info(
+        "param="
+        + json.dumps(
+            {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+        )
+    )
     pos_tokens = []
     tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
     if add_special_token:
@@ -28,19 +34,31 @@ def _tokenize(
     table = []

     for idx, token_id in enumerate(encoding):
-        decoded_text = tokenizer.decode(
+        decoded_text = tokenizer.decode(
+            [token_id]
+        )  # special characters all decode to the replacement character �, i.e. "\ufffd"
         pos_tokens.extend([(decoded_text, str(idx % color_num))])

         # token "Byte": # is this the utf-8 encoding?
-        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[
+        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[
+            0
+        ]
         if isinstance(token, bytes):
             try:
                 token_str = token.decode("utf-8")
             except:
                 token_str = token.decode("utf-8", errors="ignore")
-                logger.error(
-                    {
+                logger.error(
+                    f"{idx}: decode_error: "
+                    + json.dumps(  # gpt_35_turbo often has tokens that fail to decode; log them here
+                        {
+                            "tokenizer_type": tokenizer_name,
+                            "token": str(token),
+                            "token_str": token_str,
+                        },
+                        ensure_ascii=False,
+                    )
+                )

             token_bytes = token
             # json_dumps = json.dumps(token_str)
@@ -49,8 +67,12 @@ def _tokenize(
             token_bytes = bytes(token_str, "utf-8")
             # json_dumps = json.dumps(token_str)
         else:
-            logger.error(
-                {
+            logger.error(
+                f"{idx}: wrong type for token {token_id} {type(token)} "
+                + json.dumps(
+                    {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+                )
+            )
             token_str = token
             token_bytes = token
             # continue
@@ -58,13 +80,14 @@ def _tokenize(
         # ⭐
         # TODO: gpt3.5_turbo bug: only id and text are correct; token and utf8 are both wrong, which means convert_ids_to_tokens is failing.
         table.append(
-            {
+            {
+                "TokenID": token_id,
+                "Token": token_str,  # utf-8-decoded string; why are some shown as <0xE7>, and what does that mean? e.g. llama
+                "Text": decoded_text,  #
+                # "Bytes": token_bytes,  # the gradio frontend decodes bytes back into a string, e.g. b'\xe4\xb8\xad' still shows as "中", hence str(token_bytes)
+                "UTF8 Bytes": str(token_bytes),
+                # "Unicode": json_dumps  # show the character directly if it is ASCII, otherwise show its unicode escape
+            }
         )

     table_df = pd.DataFrame(table)
@@ -73,15 +96,14 @@ def _tokenize(


 def tokenize(
-    tokenizer_name: str,
-    color_num: int = 5,
-    add_special_token: bool = False
+    text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
 ):
-    """
+    """tokenize wrapper
     As gr.Update would be overwritten after passing to frontend, we apply lru_cache in _tokenize.
     """
-    pos_tokens, num_tokens, table_df = _tokenize(
+    pos_tokens, num_tokens, table_df = _tokenize(
+        text, tokenizer_name, color_num, add_special_token
+    )
     return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df


@@ -97,7 +119,7 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
 @lru_cache
 def basic_count(tokenizer_name):
     stats = iter_vocab(tokenizer_name)
-    return stats[
+    return stats["vocab_size"], f'{stats["organization"]}'
     # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'


@@ -125,9 +147,14 @@ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
     vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])

     overlap_tokens = vocab_set_1 & vocab_set_2
+    # TODO: visualize the add_tokens, del_tokens in a Venn diagram
+    # TODO: visualize the add_tokens, del_tokens as a git diff
+    # add_tokens = [token for token in vocab_set_2 if token not in vocab_set_1]
+    # del_tokens = [token for token in vocab_set_1 if token not in vocab_set_2]
     overlap_token_size = len(overlap_tokens)
     logger.info(
-        f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}"
+        f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}"
+    )
     return overlap_token_size, overlap_token_size


@@ -166,10 +193,16 @@ def on_load(url_params, request: gr.Request):


 def test_coding():
-    bytes1 = b
+    bytes1 = b"\xe4\xb8\xad"
     print(bytes1)  # b'\xe4\xb8\xad'


 if __name__ == "__main__":
-
+
+    # print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
+    print(
+        get_overlap_token_size(
+            "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
+        )
+    )
     # print(basic_count("internlm_chat_7b"))
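The reworked _tokenize() decodes each token id individually and also fetches the raw vocab token so both views can be shown side by side. Below is a minimal standalone sketch of that per-token loop, using a plain AutoTokenizer ("gpt2", chosen arbitrarily) instead of the repo's tokenizer_factory; it is illustrative only, not the commit's code.

from transformers import AutoTokenizer

# Decode each token id on its own and compare it with the raw vocab token,
# mirroring the loop added to _tokenize() above.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
text = "Buenos días! 华为发布Mate60手机。"
encoding = tokenizer.encode(text, add_special_tokens=False)

for idx, token_id in enumerate(encoding):
    decoded_text = tokenizer.decode([token_id])  # partial UTF-8 sequences render as "�"
    token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
    token_bytes = token.encode("utf-8") if isinstance(token, str) else token
    print(token_id, repr(token), repr(decoded_text), token_bytes)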
vocab.py
CHANGED
@@ -378,7 +378,7 @@ _all_tokenizer_config = [
     TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
     TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
     TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
-    TokenizerConfig("deepseek-ai/DeepSeek-R1", org="DeepSeek"),
+    TokenizerConfig("deepseek-ai/DeepSeek-R1", org="DeepSeek"),  # built on llama3's vocab, with some Chinese tokens added and some tokens removed
     TokenizerConfig("deepseek-ai/DeepSeek-R1-Zero", org="DeepSeek"),
     TokenizerConfig("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", org="DeepSeek"),
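The new comment describes DeepSeek-R1's tokenizer as Llama-3's vocabulary with some Chinese tokens added and some removed. A hedged sketch of how that could be checked in the spirit of get_overlap_token_size(), comparing the raw token strings from get_vocab() rather than their utf-8 byte encodings as the repo function does; the two repo names are taken from playground_util.py's __main__ block, both are sizeable downloads, and trust_remote_code=True is passed only as a precaution.

from transformers import AutoTokenizer

# Compare the vocab key sets of the two tokenizers used in __main__ above.
name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
name_2 = "deepseek-ai/DeepSeek-R1"

vocab_1 = set(AutoTokenizer.from_pretrained(name_1).get_vocab())
vocab_2 = set(AutoTokenizer.from_pretrained(name_2, trust_remote_code=True).get_vocab())

print("overlap tokens:", len(vocab_1 & vocab_2))
print("only in DeepSeek-R1:", len(vocab_2 - vocab_1))  # e.g. the added Chinese tokens
print("dropped from the Llama-3 vocab:", len(vocab_1 - vocab_2))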