import gradio as gr
import subprocess
import os
import sys
import time
import pandas as pd
from threading import Thread
import numpy as np

# Add the "src" directory of detect-pretrain-code-contamination to sys.path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

import run as evaluator  # Import the run module from detect-pretrain-code-contamination/src
from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
from src.envs import API, H4_TOKEN, REPO_ID
from huggingface_hub import HfApi
from src.utils import (
    AutoEvalColumn,
    fields,
    is_model_on_hub,
    make_clickable_names,
    styled_error,
    styled_message,
    EVAL_COLS,
    EVAL_TYPES,
)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

# CONFIGURATION:
test_datasets = ["truthful_qa", "cais/mmlu", "ai2_arc", "gsm8k", "Rowan/hellaswag", "winogrande"]
modelQueue = pd.read_csv('data/queue.csv').values.tolist()
print(modelQueue)

def restart_space():
    # Crude refresh: restarting the Space is the only way I've found to make Gradio update the leaderboard.
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)


def formatr(result):
    # Keep only the third comma-separated field of the evaluator's result, stripping the ")" and any spaces.
    result = str(result)
    result = result.split(",")[2].replace(")", "")
    result = result.replace(" ", "")
    return result
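
# Illustrative only, assuming the evaluator returns a tuple-like result such as "(0.51, 0.63, 0.72)"
# (hypothetical values; the exact format comes from detect-pretrain-code-contamination):
#   formatr("(0.51, 0.63, 0.72)")  # -> "0.72"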

def save_to_txt(model, results, model_type, ref_model):
    file_path = "data/code_eval_board.csv"
    # Build the CSV row once so the file and the console log stay identical.
    row = (
        f"\n{model_type},{model},"
        + ",".join(formatr(results[k]) for k in ["arc", "hellaswag", "mmlu", "truthfulQA", "winogrande", "gsm8k"])
        + f",{ref_model}"
    )
    with open(file_path, "a") as f:
        f.write(row)
    print(f"Finished evaluation of model: {model} using ref_model: {ref_model}")
    print(row)

def run_test(model, ref_model, data):
    print(f"|| TESTING {data} ||")
    # Call the main function in detect-pretrain-code-contamination/src/run.py
    return evaluator.main(
        target_model=f"{model}",
        ref_model=f"{ref_model}",
        output_dir="out",
        data=f"{data}",
        length=64,
        key_name="input",
        ratio_gen=0.4,
    )

def evaluate(model, model_type, ref_model):
    print(f"|| EVALUATING {model} ||")
    results = {
        "arc": run_test(model, ref_model, test_datasets[2]),         # ai2_arc
        "hellaswag": run_test(model, ref_model, test_datasets[4]),   # Rowan/hellaswag
        "mmlu": run_test(model, ref_model, test_datasets[1]),        # cais/mmlu
        "truthfulQA": run_test(model, ref_model, test_datasets[0]),  # truthful_qa
        "winogrande": run_test(model, ref_model, test_datasets[5]),  # winogrande
        "gsm8k": run_test(model, ref_model, test_datasets[3]),       # gsm8k
        "ref_model": ref_model,
    }

    # Append the scores as a new row of data/code_eval_board.csv.
    save_to_txt(model, results, model_type, ref_model)
    return "\n".join([f"{k}:{results[k]}" for k in results])
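
# Illustrative usage only (hypothetical target model; any Hub model id works the same way):
#   evaluate("mistralai/Mistral-7B-v0.1", "🟢 base", "huggyllama/llama-7b")
# would run all six contamination tests against huggyllama/llama-7b as the reference model
# and append one row to data/code_eval_board.csv.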

def worker_thread():
    global modelQueue, server
    while True:
        for submission in modelQueue:
            # evaluate(submission[1], submission[0].split(" ")[0], submission[2])
            # modelQueue.pop(modelQueue.index(submission))
            # exit()

            # The exit() above is temporary while I figure out how to unload a model from a thread (or similar).
            # Uncomment the three lines above to begin testing. I test these models outside of this space and later commit the results back.
            # I highly encourage you to try to reproduce the results I get using your own implementation.
            # Do NOT take anything listed here as fact, as I'm not 100% sure my implementation works as intended.
            # Take whatever you see in the leaderboard with a grain of salt; do NOT accuse models of cheating just because of their placement here alone.
            time.sleep(1)
        time.sleep(1)

def queue(model, model_type, ref_model):
    global modelQueue
    model = model.strip()
    ref_model = ref_model.strip()
    modelQueue.append([model_type, model, ref_model])

    file_path = "data/queue.csv"
    with open(file_path, "a") as f:
        f.write(f"\n{model_type},{model},{ref_model}")

    print(f"QUEUE:\n{modelQueue}")
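
# For reference, each submission appends one plain row of the form "model_type,model,ref_model"
# to data/queue.csv (the header row is assumed to already exist in the file), e.g. with a
# hypothetical model name:
#   🟢 base,some-org/some-model,huggyllama/llama-7b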

### Adapted from bigcode/bigcode-models-leaderboard
def add_new_eval(
    model: str,
    revision: str,
    ref_model: str,
    model_type: str,
):
    if model_type is None or model_type == "" or model_type == []:
        return styled_error("Please select a model type.")
    print(model_type)

    # Check that the model actually exists on the Hub before adding the eval.
    if revision == "":
        revision = "main"
    model_on_hub, error = is_model_on_hub(model, revision)
    if not model_on_hub:
        return styled_error(f'Model "{model}" {error}')

    print("Adding new eval")
    queue(model, model_type, ref_model)
    return styled_message("Your request has been submitted to the evaluation queue!\n")

def select_columns(df, columns):
    always_here_cols = [
        AutoEvalColumn.model_type_symbol.name,
        AutoEvalColumn.model.name,
    ]
    # We use COLS to maintain sorting
    filtered_df = df[
        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
    ]
    return filtered_df


def filter_items(df, leaderboard_table, query):
    if query == "All":
        return df[leaderboard_table.columns]
    else:
        query = query[0]  # take only the emoji character
    filtered_df = df[(df["T"] == query)]
    return filtered_df[leaderboard_table.columns]


def search_table(df, leaderboard_table, query):
    filtered_df = df[(df["Models"].str.contains(query, case=False))]
    return filtered_df[leaderboard_table.columns]

demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> 🔍 LLM Contamination Detector </h1></div>\
            <br>\
            <p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and the <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">🤗 Big Code Models Leaderboard ⭐</a>, we use the implementation of the <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> paper found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this GitHub repo</a> to provide contamination scores for LLMs on the datasets used by the Open LLM Leaderboard.\
            This space should NOT be used to flag or accuse models of cheating or being contaminated; instead, it should form part of a holistic assessment by the parties involved.</p>""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("📊 Evaluations", id=0):
                    with gr.Column():
                        with gr.Accordion("➡️ See filters", open=False):
                            shown_columns = gr.CheckboxGroup(
                                choices=[
                                    c
                                    for c in COLS
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                value=[
                                    c
                                    for c in COLS_LITE
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                label="",
                                elem_id="column-select",
                                interactive=True,
                            )
                        # with gr.Column(min_width=780):
                        with gr.Row():
                            search_bar = gr.Textbox(
                                placeholder="🔍 Search for a model and press ENTER...",
                                show_label=False,
                                elem_id="search-bar",
                            )
                            filter_columns = gr.Radio(
                                label="⏚ Filter model types",
                                choices=["All", "🟢 Base", "🔶 Finetuned"],
                                value="All",
                                elem_id="filter-columns",
                            )
                        df = pd.read_csv("data/code_eval_board.csv")
                        leaderboard_df = gr.components.Dataframe(
                            value=df[
                                [
                                    AutoEvalColumn.model_type_symbol.name,
                                    AutoEvalColumn.model.name,
                                ]
                                + shown_columns.value
                            ],
                            headers=[
                                AutoEvalColumn.model_type_symbol.name,
                                AutoEvalColumn.model.name,
                            ]
                            + shown_columns.value,
                            datatype=TYPES,
                            elem_id="leaderboard-table",
                            interactive=False,
                        )
                        hidden_leaderboard_df = gr.components.Dataframe(
                            value=df,
                            headers=COLS,
                            datatype=["str" for _ in range(len(COLS))],
                            visible=False,
                        )

                        search_bar.submit(
                            search_table,
                            [hidden_leaderboard_df, leaderboard_df, search_bar],
                            leaderboard_df,
                        )
                        filter_columns.change(
                            filter_items,
                            [hidden_leaderboard_df, leaderboard_df, filter_columns],
                            leaderboard_df,
                        )
                        shown_columns.change(
                            select_columns,
                            [hidden_leaderboard_df, shown_columns],
                            leaderboard_df,
                        )
                        gr.Markdown(
                            """
                            **Notes:**
                            - The Hugging Face team is working on their own implementation of this paper as a Space; I'll keep this one up until that's available.
                            - Some scores may not be entirely faithful to the cited paper while I work out the kinks and inaccuracies of this implementation.
                            - For any issues, questions, or comments, either open a discussion in this Space's community tab or message me directly on Discord: yeyito777.
                            - Make sure to check the pinned discussion in this Space's community tab for the implementation details I'm not 100% sure about.
                            """,
                            elem_classes="markdown-text",
                        )

                with gr.TabItem("📝 About", id=2):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

                with gr.TabItem("🛠️ Submit models", id=3):
                    gr.Markdown(SUBMISSION_TEXT)
                    gr.Markdown(
                        "## 🤗 Submit a model here:", elem_classes="markdown-text"
                    )
                    with gr.Column():
                        with gr.Column():
                            with gr.Accordion(
                                f"⏳ Evaluation Queue ({len(modelQueue)})",
                                open=False,
                            ):
                                with gr.Row():
                                    finished_eval_table = gr.components.Dataframe(
                                        value=pd.DataFrame(modelQueue, columns=['Type', 'Model', 'Reference Model']),
                                    )
                            with gr.Row():
                                model_name = gr.Textbox(label="Model name")
                                revision_name = gr.Textbox(
                                    label="revision", placeholder="main"
                                )
                            with gr.Row():
                                ref_model = gr.Dropdown(
                                    choices=[
                                        "mistralai/Mistral-7B-v0.1",
                                        "huggyllama/llama-7b",
                                        "NousResearch/Llama-2-7b-hf",
                                        "upstage/SOLAR-10.7B-v1.0",
                                    ],
                                    label="Reference Model",
                                    multiselect=False,
                                    value="mistralai/Mistral-7B-v0.1",
                                    interactive=True,
                                )
                                model_type = gr.Dropdown(
                                    choices=["🟢 base", "🔶 finetuned"],
                                    label="Model type",
                                    multiselect=False,
                                    value=None,
                                    interactive=True,
                                )
                            submit_button = gr.Button("Submit Eval")
                            submission_result = gr.Markdown()
                            submit_button.click(
                                add_new_eval,
                                inputs=[model_name, revision_name, ref_model, model_type],
                                outputs=[submission_result],
                            )
                    gr.Markdown(SUBMISSION_TEXT_2)

thread = Thread(target=worker_thread)
thread.start()
demo.launch(share=True)

# Some worries:
# 1. Am I testing things correctly in eval.py, following the template format?
# 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
#    (as in: if a test split exists I go with that, then validation, then the default train split).
# 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
#    (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)
# 4. I'm unsure why in eval.py we append the output at the end of the input.
# 5. Currently I'm using huggyllama/llama-7b as ref_model. Should I switch to Llama-2-7B? Maybe Mistral-7B?