from typing import List, Tuple

import gradio as gr
import tiktoken
from transformers import AutoTokenizer
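
# Hugging Face tokenizers to compare, as (model id, use_fast) pairs; the two
# Mongolian BERT checkpoints request the slow (Python) tokenizer.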
hf_tokenizer_list = [
    ("tugstugi/bert-large-mongolian-cased", False),
    ("tugstugi/bert-large-mongolian-uncased", False),
    ("bayartsogt/mongolian-roberta-large", True),
    ("meta-llama/Llama-2-13b-hf", True),
    ("bigscience/bloom", True),
]
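
# OpenAI models, rendered through their tiktoken encodings.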
openai_tokenizer_list = [
    "text-davinci-003",
    "gpt-4",
]
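
# All tokenizers are instantiated eagerly at import time. Note: some
# checkpoints (e.g. meta-llama/Llama-2-13b-hf) are gated on the Hub and may
# require accepting the license and authenticating (HF_TOKEN or
# `huggingface-cli login`) before they can be downloaded. Also note that
# trust_remote_code=True lets a repo execute its own tokenizer code, so keep
# it only for model repos you trust.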
hf_tokenizers = [
    AutoTokenizer.from_pretrained(model_name_or_id, use_fast=use_fast, trust_remote_code=True)
    for model_name_or_id, use_fast in hf_tokenizer_list
]

openai_tokenizers = [
    tiktoken.encoding_for_model(name)
    for name in openai_tokenizer_list
]


def do_tokenize(tokenizer, text: str) -> List[Tuple[str, str]]:
    # Works for both Hugging Face tokenizers and tiktoken encodings, since both
    # expose encode(str) -> list of ids and decode(list of ids) -> str. Returns
    # (token_text, label) pairs in the format gr.HighlightedText expects.
    return [(tokenizer.decode([token_id]), str(i)) for i, token_id in enumerate(tokenizer.encode(text))]
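
# Illustrative example (hypothetical; actual splits depend on the tokenizer):
#   do_tokenize(openai_tokenizers[0], "hello world")
#   -> [("hello", "0"), (" world", "1")]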


def do_simple_split(text: str) -> List[Tuple[str, str]]:
    # Naive whitespace split, shown as a baseline alongside the real tokenizers.
    return [(x, str(i)) for i, x in enumerate(text.split())]


def do_function(text: str):
    # Output order must match the output component list passed to gr.Interface.
    return (
        text,
        len(text),
        do_simple_split(text),
        *[do_tokenize(tokenizer, text) for tokenizer in hf_tokenizers],
        *[do_tokenize(tokenizer, text) for tokenizer in openai_tokenizers],
    )
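

# live=True makes Gradio re-run do_function on every input change, so the
# highlighted tokenizations update as the user types.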
demo = gr.Interface(
    do_function,
    [
        # Placeholder is sample Mongolian text, roughly: "Let me know once
        # you've deposited the money".
        gr.Text("", placeholder="Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй"),
    ],
    [
        gr.Text("", label="Input"),
        gr.Number(0, label="Character Count"),
        gr.HighlightedText("", label="Simple Split"),
        *[gr.HighlightedText("", label=tokenizer_name) for tokenizer_name, _ in hf_tokenizer_list],
        *[gr.HighlightedText("", label="openai/" + tokenizer_name) for tokenizer_name in openai_tokenizer_list],
    ],
    live=True,
    # Note: newer Gradio releases rename allow_flagging to flagging_mode.
    allow_flagging="never",
    title="Real-Time Tokenizer",
    description=(
        "**Tokenizers:**\n"
        + "\n".join(
            [f"🤗 [{x}](https://huggingface.co/{x})" for x, _ in hf_tokenizer_list]
            + [f"⏳ [{x}](https://github.com/openai/tiktoken)" for x in openai_tokenizer_list]
        )
    ),
)


if __name__ == "__main__":
    demo.launch()
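
# To run locally (the file name app.py is an assumption):
#   pip install gradio transformers tiktoken
#   python app.py
# Some checkpoints may additionally need sentencepiece/protobuf if they ship
# only a sentencepiece tokenizer model.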