# Spaces:
# Running
# Running
"""Gradio app showing a tokenizer compression-rate leaderboard.

The user picks one or more corpora and a unit of measure; the table is
filled with whatever ``get_compression_leaderboard`` returns for that
selection.  The search-box text is forwarded to the same function as a
third argument (presumably a tokenizer-name filter -- confirm against
``utils.compression_util``).
"""

import gradio as gr
from utils.compression_util import get_compression_leaderboard
from utils.compression_util import common_corpuses

# NOTE(review): the nesting of the layout below was reconstructed from the
# call order; confirm that the help text really belongs inside the accordion.
with gr.Blocks() as demo:
    # TODO: a "Convertor" panel (file size in MB/GB/TB ≈ number of tokens in
    # million/billion/trillion) was prototyped here and is currently disabled,
    # together with its `file_size.change` handler.

    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
        with gr.Row():
            compress_rate_corpus = gr.Dropdown(
                common_corpuses,  # , "code"
                value=["cc100-en", "cc100-zh-Hans"],
                label="corpus",
                multiselect=True,
            )
            # Units: file size in gigabytes / terabytes,
            # token count in billions / trillions.
            compress_rate_unit = gr.Radio(
                ["b_tokens/g_bytes", "t_tokens/t_bytes"],
                value="b_tokens/g_bytes",
                label="measure",
            )

        # NOTE(review): this help text also describes `n_chars/n_tokens`,
        # which the radio above does not offer -- confirm which is intended.
        gr.Markdown(
            "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus. \n"
            "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus. \n"
            "- `n_chars/n_tokens` measures how many chars per token in the tokenized corpus. \n\n"
            "All the above measures are depend on corpus. You can reproduce this "
            "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
        )

    gr.Markdown("## 🏆 Compression Rate Leaderboard")
    search_bar = gr.Textbox(
        placeholder="🔍 Search tokenizers(e.g., 'llama') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe()

    # Every user-driven refresh sends the same three inputs.  Previously the
    # two `.change` handlers omitted `search_bar`, so changing the corpus or
    # the unit rebuilt the table without the search text that was still
    # visible in the search box.
    leaderboard_inputs = [compress_rate_corpus, compress_rate_unit, search_bar]

    compress_rate_corpus.change(
        get_compression_leaderboard,
        inputs=leaderboard_inputs,
        outputs=compress_rate_table,
    )
    compress_rate_unit.change(
        get_compression_leaderboard,
        inputs=leaderboard_inputs,
        outputs=compress_rate_table,
    )
    search_bar.submit(
        get_compression_leaderboard,
        inputs=leaderboard_inputs,
        outputs=compress_rate_table,
    )

    # Initial fill on page load: no search text has been entered yet, so only
    # the two settings are passed (unchanged from the original wiring).
    demo.load(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table,
    )

if __name__ == "__main__":
    demo.launch()