Trying to stop OOMs on MMLU and GSM8K by halving seq len
app.py CHANGED
@@ -1,311 +1,235 @@
(old lines 1–2 and 5–64 were removed here; their content is not preserved in this view)
 import os
 import sys
print(f"|| EVALUATING {model} ||")
|
| 66 |
-
results = {
|
| 67 |
-
"arc": run_test(model, ref_model, test_datasets[2]),
|
| 68 |
-
"hellaswag": run_test(model, ref_model, test_datasets[4]),
|
| 69 |
-
"mmlu": run_test(model, ref_model, test_datasets[1]),
|
| 70 |
-
"truthfulQA": run_test(model, ref_model, test_datasets[0]),
|
| 71 |
-
"winogrande": run_test(model, ref_model, test_datasets[5]),
|
| 72 |
-
"gsm8k": run_test(model, ref_model, test_datasets[3]),
|
| 73 |
-
"ref_model": ref_model,
|
| 74 |
-
}
|
| 75 |
-
|
| 76 |
-
# Save to .txt file in /Evaluations/{model}
|
| 77 |
-
save_to_txt(model, results, model_type)
|
| 78 |
-
return "\n".join([f"{k}:{results[k]}" for k in results])
|
| 79 |
-
|
| 80 |
-
def worker_thread():
|
| 81 |
-
global modelQueue, server
|
| 82 |
-
while True:
|
| 83 |
-
for submission in modelQueue:
|
| 84 |
-
evaluate(submission[0],submission[1].split(" ")[0])
|
| 85 |
-
modelQueue.pop(modelQueue.index(submission))
|
| 86 |
-
time.sleep(1)
|
| 87 |
-
time.sleep(1)
|
| 88 |
-
|
| 89 |
-
def queue(model,model_type):
|
| 90 |
-
global modelQueue
|
| 91 |
-
modelQueue.append([model,model_type])
|
| 92 |
-
print(f"QUEUE:\n{modelQueue}")
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
### bigcode/bigcode-models-leaderboard
|
| 96 |
-
def add_new_eval(
|
| 97 |
-
model: str,
|
| 98 |
-
revision: str,
|
| 99 |
-
precision: str,
|
| 100 |
-
model_type: str,
|
| 101 |
-
):
|
| 102 |
-
precision = precision
|
| 103 |
-
|
| 104 |
-
if model_type is None or model_type == "" or model_type == []:
|
| 105 |
-
return styled_error("Please select a model type.")
|
| 106 |
-
print(model_type)
|
| 107 |
-
# check the model actually exists before adding the eval
|
| 108 |
-
if revision == "":
|
| 109 |
-
revision = "main"
|
| 110 |
-
|
| 111 |
-
model_on_hub, error = is_model_on_hub(model, revision)
|
| 112 |
-
if not model_on_hub:
|
| 113 |
-
return styled_error(f'Model "{model}" {error}')
|
| 114 |
-
|
| 115 |
-
print("Adding new eval")
|
| 116 |
-
queue(model,model_type)
|
| 117 |
-
return styled_message("Your request has been submitted to the evaluation queue!\n")
|
| 118 |
-
|
| 119 |
-
def select_columns(df, columns):
|
| 120 |
-
always_here_cols = [
|
| 121 |
-
AutoEvalColumn.model_type_symbol.name,
|
| 122 |
-
AutoEvalColumn.model.name,
|
| 123 |
-
]
|
| 124 |
-
# We use COLS to maintain sorting
|
| 125 |
-
filtered_df = df[
|
| 126 |
-
always_here_cols + [c for c in COLS if c in df.columns and c in columns]
|
| 127 |
-
]
|
| 128 |
-
return filtered_df
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
def filter_items(df, leaderboard_table, query):
|
| 132 |
-
if query == "All":
|
| 133 |
-
return df[leaderboard_table.columns]
|
| 134 |
else:
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
)
|
| 287 |
-
submit_button = gr.Button("Submit Eval")
|
| 288 |
-
submission_result = gr.Markdown()
|
| 289 |
-
submit_button.click(
|
| 290 |
-
add_new_eval,
|
| 291 |
-
inputs=[model_name, revision_name, precision, model_type],
|
| 292 |
-
outputs=[submission_result],
|
| 293 |
-
)
|
| 294 |
-
gr.Markdown(SUBMISSION_TEXT_2)
|
| 295 |
-
|
| 296 |
-
thread = Thread(target=worker_thread)
|
| 297 |
-
thread.start()
|
| 298 |
-
demo.launch(share=True)
|
-
-# Some worries:
-# 1. Am I testing things correctly in eval.py, following the template format?
-
-# 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
-# (As in: if test exists, I go with that, then validation, then default)
-
-# 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
-# (Not sure which one the Open LLM Leaderboard uses, or what the standard is.)
-
-# 4. I'm unsure why in eval.py we append the output at the end of the input.

-# 5. Currently I'm using huggyllama/llama-7b as ref_model; should I switch to llama2-7B? Maybe Mistral-7B?

+import logging
+logging.basicConfig(level='ERROR')
+import numpy as np
+from pathlib import Path
+import openai
+import torch
+import zlib
+import statistics
+from torch.utils.data import DataLoader
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from tqdm import tqdm
+import math
+import numpy as np
+from datasets import load_dataset
+from options import Options
+from ipdb import set_trace as bp
+from eval import *
+from utils import evaluate_model
+from analyze import analyze_data
+import argparse
 import os
 import sys
+import gc
+import pickle
+
+models = {}
+
+def save_data(filename, data):
+    with open(filename, 'wb') as filehandle:
+        # store the data as a binary data stream
+        pickle.dump(data, filehandle)
+
+def load_data(filename):
+    with open(filename, 'rb') as filehandle:
+        # read the data as a binary data stream
+        loaded_data = pickle.load(filehandle)
+
+    return loaded_data
+
+def unload_model(model, tokenizer):
+    print("[X] Cannot unload model! Functionality not implemented!")
+
+def load_model(name1):
+    if name1 not in models:
+        model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
+        model1.eval()
+        tokenizer1 = AutoTokenizer.from_pretrained(name1)
+
+        tokenizer1.pad_token = tokenizer1.eos_token
+        models[name1] = model1
+        models[name1 + "_tokenizer"] = tokenizer1
+    return models[name1], models[name1 + "_tokenizer"]
+
+def calculatePerplexity(sentence, model, tokenizer, gpu):
+    """
+    exp(loss)
+    """
+    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
+    input_ids = input_ids.to(gpu)
+    with torch.no_grad():
+        outputs = model(input_ids, labels=input_ids)
+    loss, logits = outputs[:2]
+
+    '''
+    extract logits:
+    '''
+    # Apply log-softmax to the logits to get per-token log-probabilities
+    probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
+    # probabilities = torch.nn.functional.softmax(logits, dim=-1)
+    all_prob = []
+    input_ids_processed = input_ids[0][1:]
+
+    for i, token_id in enumerate(input_ids_processed):
+        probability = probabilities[0, i, token_id].item()
+        all_prob.append(probability)
+    return torch.exp(loss).item(), all_prob, loss.item()
+
+def sample_generation(sentence, model, tokenizer, args, data_name):
+    half_sentence_index = math.ceil(len(sentence.split()) * args['prefix_length'])
+
+    if half_sentence_index > 0:
+        prefix = " ".join(sentence.split()[:half_sentence_index])
+    else:
+        prefix = '<|startoftext|> '
+
+    input_ids = torch.tensor(tokenizer.encode(prefix)).unsqueeze(0)
+    input_ids = input_ids.to(model.device)
+
+    output = None
+    # Halve the generation budget for MMLU and GSM8K to reduce memory use (floor division keeps max_new_tokens an int).
+    if data_name != "cais/mmlu" and data_name != "gsm8k":
+        output = model.generate(input_ids, max_new_tokens=len(sentence.split())-half_sentence_index, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
+    else:
+        output = model.generate(input_ids, max_new_tokens=(len(sentence.split())-half_sentence_index)//2, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
+    # print(output)
+    complete_generated_text = tokenizer.batch_decode(output, skip_special_tokens=True)
+
+    return complete_generated_text
+
+
+def RMIA_1(text, target_loss, ref_loss, model1, tokenizer1, ratio_gen, neighbors_dl):
+    target_losses_z = evaluate_model(model1, tokenizer1, neighbors_dl)
+    result = torch.count_nonzero(target_losses_z < target_loss).item() / len(target_losses_z)
+    return result
+
+def get_neighbors(text, ref_loss, model2, tokenizer2, ratio_gen, data_name):
+    cur_args = {'prefix_length': ratio_gen, 'num_z': 100, 'generate_args': {'do_sample': True}}
+    neighbors = sample_generation(text, model2, tokenizer2, cur_args, data_name)
+    neighbors_dl = DataLoader(neighbors, batch_size=32, shuffle=False)
+    return neighbors_dl
+
+def evaluate_data(test_data, col_name, target_model, ref_model, ratio_gen, data_name):
+    global model1, model2, tokenizer1, tokenizer2
+    print(f"all data size: {len(test_data)}")
+    random.seed(0)
+    random.shuffle(test_data)
+    test_data = test_data[:100]
+
+    inference2_pass = None
+    neighbors_dls = None
+    ref_model_clean = ref_model.replace("/", "-")
+    data_name_clean = data_name.replace("/", "-")
+    os.makedirs(os.path.join(f"saves/{ref_model_clean}", f"{data_name_clean}"), exist_ok=True)
+    try:
+        inference2_pass = load_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt')
+        neighbors_dls = load_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt')
+    except:
+        ### MODEL 2 likelihoods
+        model2, tokenizer2 = load_model(ref_model)
+        inference2_pass = []  # 0: p_ref, 1: all_prob_ref, 2: p_ref_likelihood
+        for ex in tqdm(test_data):
+            text = ex[col_name]
+            new_ex = inference_model2(model2, tokenizer2, text)
+            inference2_pass.append(new_ex)
+        # Invariant. Doesn't take in model1 so I'm good
+
+        ### Neighbors:
+        neighbors_dls = []
+        counter = 0
+        for ex in tqdm(test_data):
+            text = ex[col_name]
+            new_ex = get_neighbors(text, inference2_pass[counter][2], model2, tokenizer2, ratio_gen, data_name)
+            counter = counter + 1
+            neighbors_dls.append(new_ex)
+        unload_model(model2, tokenizer2)
+        # Because it uses temp it is not invariant, however taking a snapshot in time should be just fine.
+        save_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt', inference2_pass)
+        save_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt', neighbors_dls)
+        print("Saved ref data, exiting.")
+
+    ### MODEL 1 likelihoods
+    model1, tokenizer1 = load_model(target_model)
+    inference1_pass = []  # 0: p1, 1: all_prob, 2: p1_likelihood, 3: p_lower, 4: p_lower_likelihood
+    for ex in tqdm(test_data):
+        text = ex[col_name]
+        new_ex = inference_model1(model1, tokenizer1, text)
+        inference1_pass.append(new_ex)
+
+    ### RMIA results
+    model1, tokenizer1 = load_model(target_model)
+    counter = 0
+    results = []
+    for ex in tqdm(test_data):
+        text = ex[col_name]
+        new_ex = RMIA_1(text, inference1_pass[counter][2], inference2_pass[counter][2], model1, tokenizer1, ratio_gen, neighbors_dls[counter])
+        counter = counter + 1
+        results.append(new_ex)
+    unload_model(model1, tokenizer1)
+
+    ### Inference ex
+    all_output = []
+    counter = 0
+    for ex in tqdm(test_data):
+        text = ex[col_name]
+        pred = {}
+        pred["minkprob_w/_ref"] = results[counter]
+        pred["ppl"] = inference1_pass[counter][0]
+        pred["ppl/Ref_ppl (calibrate PPL to the reference model)"] = inference1_pass[counter][2] - inference2_pass[counter][2]
+        pred["ppl/lowercase_ppl"] = -(np.log(inference1_pass[counter][3]) / np.log(inference1_pass[counter][0])).item()
+        zlib_entropy = len(zlib.compress(bytes(text, 'utf-8')))
+        pred["ppl/zlib"] = np.log(inference1_pass[counter][0]) / zlib_entropy
+        ex["pred"] = pred
+        counter = counter + 1
+        all_output.append(ex)
+    return all_output
+
+def inference_model1(model1, tokenizer1, text):
+    p1, all_prob, p1_likelihood = calculatePerplexity(text, model1, tokenizer1, gpu=model1.device)
+    p_lower, _, p_lower_likelihood = calculatePerplexity(text.lower(), model1, tokenizer1, gpu=model1.device)
+    return [p1, all_prob, p1_likelihood, p_lower, p_lower_likelihood]
+
+def inference_model2(model2, tokenizer2, text):
+    p_ref, all_prob_ref, p_ref_likelihood = calculatePerplexity(text, model2, tokenizer2, gpu=model2.device)
+    return [p_ref, all_prob_ref, p_ref_likelihood]
+
+def main(target_model, ref_model, output_dir, data, length, key_name, ratio_gen):
+    output_dir = f"{output_dir}/{target_model}_{ref_model}/{key_name}"
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+    # load model and data
+    data_name = data
+    if "jsonl" in data:
+        data = load_jsonl(f"{data}")
+    elif data == "truthful_qa":
+        # bp()
+        dataset = load_dataset(data, "multiple_choice", split="validation")
+        data = convert_huggingface_data_to_list_dic(dataset)
+        data = process_truthful_qa(data)
+    elif data == "cais/mmlu":
+        dataset = load_dataset(data, "all", split="test")
+        data = convert_huggingface_data_to_list_dic(dataset)
+        data = process_mmlu(data)
+    elif data == "ai2_arc":
+        dataset = load_dataset(data, "ARC-Challenge", split="test")
+        data = convert_huggingface_data_to_list_dic(dataset)
+        data = process_arc(data)
+    elif data == "gsm8k":
+        dataset = load_dataset(data, "main", split="test")
+        data = convert_huggingface_data_to_list_dic(dataset)
+        data = process_gsm8k(data)
+    elif data == "Rowan/hellaswag":
+        dataset = load_dataset(data, "default", split="validation")
+        # We use validation since labels for the test set are not available
+        data = convert_huggingface_data_to_list_dic(dataset)
+        data = process_hellaswag(data)
+    elif data == "winogrande":
+        dataset = load_dataset(data, "winogrande_debiased", split="validation")
+        data = convert_huggingface_data_to_list_dic(dataset)
+        data = process_winogrande(data)
+
+    # model1, model2, tokenizer1, tokenizer2 = load_model(target_model, ref_model)
+
+    all_output = evaluate_data(data, key_name, target_model, ref_model, ratio_gen, data_name)
+    dump_jsonl(all_output, f"{output_dir}/all_output.jsonl")
+    return analyze_data(all_output)
+    # fig_fpr_tpr(all_output, output_dir)
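
The hunk ends inside main(), and its call site (Gradio wiring or a __main__ block) is not shown in this view. A minimal sketch of how the new entry point could be invoked, with illustrative values only; huggyllama/llama-7b is simply the ref_model mentioned in the removed worry #5, and key_name/output_dir are placeholders rather than values taken from the Space:

if __name__ == "__main__":
    analysis = main(
        target_model="huggyllama/llama-7b",   # illustrative target model
        ref_model="huggyllama/llama-7b",      # illustrative reference model
        output_dir="out",
        data="cais/mmlu",
        length=None,          # not used in the body shown above
        key_name="input",     # placeholder: the column evaluate_data reads text from
        ratio_gen=0.4,        # prefix fraction passed through to sample_generation
    )
    print(analysis)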
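
unload_model is left as a stub and gc is imported but never used, even though the commit is aimed at OOMs. If the intent is eventually to free GPU memory between the reference-model and target-model passes, the usual pattern looks roughly like the sketch below; the name/registry arguments are hypothetical and not part of app.py, and whether this alone is enough here is untested:

import gc
import torch

def unload_model_sketch(name, registry):
    # Drop the cached references so the objects become collectable, then release
    # whatever CUDA memory the caching allocator is still holding.
    registry.pop(name, None)
    registry.pop(name + "_tokenizer", None)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

As written, load_model keeps both models alive in the module-level models dict for the whole run, so any real unload would have to evict them from that dict as well.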
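
For intuition, RMIA_1 reduces to the fraction of neighbor losses that fall below the loss of the original text under the target model. A standalone illustration with invented numbers (in app.py the neighbor losses come from evaluate_model, assumed to return one loss per generated neighbor):

import torch

target_loss = 2.1                                           # loss of the original text
neighbor_losses = torch.tensor([1.8, 2.4, 2.0, 3.1, 2.6])   # losses of generated neighbors
score = torch.count_nonzero(neighbor_losses < target_loss).item() / len(neighbor_losses)
print(score)  # 0.4: two of the five neighbors score lower than the original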
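
The "ppl/zlib" entry built in evaluate_data is the log of the target-model perplexity divided by the zlib-compressed byte length of the text. A self-contained illustration with an arbitrary string and an invented perplexity value:

import zlib
import numpy as np

text = "The quick brown fox jumps over the lazy dog."
ppl = 23.7  # stand-in for inference1_pass[counter][0]
zlib_entropy = len(zlib.compress(bytes(text, "utf-8")))
print(zlib_entropy, np.log(ppl) / zlib_entropy)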
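
Worry #2 in the removed comments describes the split-selection rule in run.py as test > validation > train, falling back to whatever the dataset offers. run.py is not part of this diff, so the sketch below only spells out that stated rule and is not the actual implementation; get_dataset_split_names comes from the datasets library:

from datasets import get_dataset_split_names

def pick_split(dataset_name, config_name=None):
    # Prefer test, then validation, then train; otherwise take the first available split.
    splits = get_dataset_split_names(dataset_name, config_name)
    for preferred in ("test", "validation", "train"):
        if preferred in splits:
            return preferred
    return splits[0]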