Commit ab74236 • natolambert committed • 1 Parent(s): 56fcfaf
updates

README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title:
-emoji:
+title: HERM Leaderboard
+emoji: π
 colorFrom: pink
 colorTo: blue
 sdk: gradio

app.py CHANGED
@@ -4,17 +4,16 @@ from huggingface_hub import HfApi, snapshot_download
 from apscheduler.schedulers.background import BackgroundScheduler
 from datasets import load_dataset
 from src.utils import load_all_data
-from src.md import ABOUT_TEXT
+from src.md import ABOUT_TEXT, TOP_TEXT
 import numpy as np
 
 api = HfApi()
 
 COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
-evals_repo = "ai2-adapt-dev/
-
+evals_repo = "ai2-adapt-dev/HERM-Results"
+
 eval_set_repo = "ai2-adapt-dev/rm-benchmark-dev"
 repo_dir_herm = "./evals/herm/"
-repo_dir_prefs = "./evals/prefs/"
 
 def restart_space():
     api.restart_space(repo_id="ai2-adapt-dev/rm-benchmark-viewer", token=COLLAB_TOKEN)
@@ -29,14 +28,6 @@ repo = snapshot_download(
     repo_type="dataset",
 )
 
-repo_pref_sets = snapshot_download(
-    local_dir=repo_dir_prefs,
-    repo_id=prefs_repo,
-    use_auth_token=COLLAB_TOKEN,
-    tqdm_class=None,
-    etag_timeout=30,
-    repo_type="dataset",
-)
 
 def avg_over_herm(dataframe):
     """
@@ -126,10 +117,10 @@ def length_bias_check(dataframe):
 
 
 
-herm_data = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
+herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
 herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
 herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
-prefs_data = load_all_data(
+prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
 # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
 
 col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
@@ -152,7 +143,7 @@ def random_sample(r: gr.Request, subset):
     sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
     sample = eval_set_filtered[sample_index]
 
-    markdown_text = '\n\n'.join([f"**{key}**:\n{value}" for key, value in sample.items()])
+    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 
 subsets = eval_set.unique("subset")
@@ -160,38 +151,48 @@ subsets = eval_set.unique("subset")
 with gr.Blocks() as app:
     # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
     with gr.Row():
-        gr.Markdown(
+        gr.Markdown(TOP_TEXT)
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("HERM - Overview"):
+        with gr.TabItem("HERM Eval Set - Overview"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
+                   height=1000,
                )
-        with gr.TabItem("HERM - Detailed"):
+        with gr.TabItem("HERM Eval Set - Detailed"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
+                   height=1000,
                )
-        with gr.TabItem("HERM - Length Bias"):
+        with gr.TabItem("HERM Eval Set - Length Bias"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data_length.values,
                    datatype=cols_herm_data_length,
                    headers=herm_data_length.columns.tolist(),
                    elem_id="herm_dataframe_length",
+                   height=1000,
                )
-        with gr.TabItem("Pref Sets
+        with gr.TabItem("Known Pref. Sets"):
+            with gr.Row():
+                PREF_SET_TEXT = """
+For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets).
+"""
+                gr.Markdown(PREF_SET_TEXT)
+            with gr.Row():
                pref_sets_table = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
+                   height=1000,
                )
 
        with gr.TabItem("About"):

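The net effect of these app.py hunks: the second `snapshot_download` (and the `prefs_repo` / `repo_dir_prefs` constants) disappear because both leaderboard tables are now read from a single `ai2-adapt-dev/HERM-Results` snapshot, split into `eval-set` and `pref-sets` subdirectories. A minimal sketch of that flow is below; it only reuses calls visible in the diff, and the keyword set for `snapshot_download` is copied from the removed `repo_pref_sets` block, so treat it as an assumption about the surviving call.

```python
# Sketch, not the full app: one snapshot now feeds both leaderboard tables.
import os

from huggingface_hub import snapshot_download

from src.utils import load_all_data

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "ai2-adapt-dev/HERM-Results"   # single results repo after this commit
repo_dir_herm = "./evals/herm/"

# Kwargs assumed to mirror the removed repo_pref_sets call.
snapshot_download(
    local_dir=repo_dir_herm,
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)

# Both tables come from subdirectories of the same local snapshot.
herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by="average", ascending=False)
prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by="average", ascending=False)
```

The other behavioral tweak is in `random_sample`: switching `\n` to `\n\n` after each `**{key}**:` matters because Gradio renders the string as Markdown, and Markdown only starts a new paragraph on a blank line.
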
src/md.py CHANGED
@@ -2,32 +2,69 @@ ABOUT_TEXT = """
 We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
 A win is when the score for the chosen response is higher than the score for the rejected response.
 
+## Subset Summary
 
+Total number of the prompts is: 2538, filtered from 4676.
+
 | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
-| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
+| :---------- | :-----: | :---------: |
 | alpacaeval-easy | 805, 100 | Great model vs poor model |
 | alpacaeval-length | 805, 95 | Good model vs low model, equal length |
 | alpacaeval-hard | 805, 95 | Great model vs baseline model |
 | mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
 | mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
 | mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
 | refusals-dangerous | 505, 100 | Dangerous response vs no response |
 | refusals-offensive | 704, 100 | Offensive response vs no response |
 | llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
 | llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
 | llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
 | llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
 | llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
-| XSTest | 450, 404 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| xstest-should-refuse | 450, 250 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| xstest-should-respond | 450, 154 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
 | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
 | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
 | hep-go | 164 | Go code |
 | hep-java | 164 | Java code |
 | hep-js | 164 | Javascript code |
 | hep-python | 164 | Python code |
 | hep-rust | 164 | Rust code |
+
+Lengths (mean, std. dev.) include the prompt
+
+| subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
+|-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
+| alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
+| alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
+| alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
+| donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
+| hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
+| hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
+| hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
+| hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
+| hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
+| hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
+| llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
+| llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
+| llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
+| llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
+| llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
+| mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
+| mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
+| mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
+| refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
+| refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
+| xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
+| xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
 
 For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
 """
+
+TOP_TEXT = """
+# Holistic Evaluation of Reward Models (HERM) from AI2
+
+Evaluating the capabilities, safety, and pitfalls of reward models.
+
+[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
+"""

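The first two sentences of `ABOUT_TEXT` define the metric behind every number on the leaderboard: per subset, the score is the share of pairs where the reward model scores the chosen response above the rejected one. A minimal sketch of that computation, with column names (`subset`, `scores_chosen`, `scores_rejected`) assumed for illustration rather than taken from the released results schema:

```python
# Sketch of the win-percentage metric described in ABOUT_TEXT.
import pandas as pd


def win_percentage(df: pd.DataFrame) -> pd.Series:
    # A "win" is a pair where the chosen response outscores the rejected one.
    wins = df["scores_chosen"] > df["scores_rejected"]
    # Average the boolean wins within each subset to get a percentage.
    return wins.groupby(df["subset"]).mean() * 100


example = pd.DataFrame(
    {
        "subset": ["alpacaeval-easy", "alpacaeval-easy", "mt-bench-hard"],
        "scores_chosen": [2.1, 0.3, 1.5],
        "scores_rejected": [1.0, 0.9, 0.2],
    }
)
print(win_percentage(example))  # alpacaeval-easy 50.0, mt-bench-hard 100.0
```

The new length table (means with standard deviations in parentheses, prompt included) appears to exist to make the length-bias tab interpretable: subsets marked `True` are the ones where the chosen responses run noticeably longer than the rejected ones, `False` the reverse.
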
src/utils.py CHANGED
@@ -11,9 +11,9 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 # Define a function to fetch and process data
-def load_all_data(data_repo, subsubsets=False):    # use HF api to pull the git repo
+def load_all_data(data_repo, subdir:str, subsubsets=False):    # use HF api to pull the git repo
     dir = Path(data_repo)
-    data_dir = dir /
+    data_dir = dir / subdir
     orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
     # get all files within the sub folders orgs
     models_results = []
@@ -29,7 +29,7 @@ def load_all_data(data_repo, subsubsets=False):    # use HF api to pull the git
 
     # load all json data in the list models_results one by one to avoid not having the same entries
     for model in models_results:
-        model_data = load_dataset("json", data_files=data_repo + "
+        model_data = load_dataset("json", data_files=data_repo + subdir + "/" + model, split="train")
         df2 = pd.DataFrame(model_data)
         # add to df
         df = pd.concat([df2, df])
@@ -63,8 +63,14 @@ def load_all_data(data_repo, subsubsets=False):    # use HF api to pull the git
     cols.insert(1, cols.pop(cols.index('average')))
     df = df.loc[:, cols]
 
-    # remove
+    # remove column xstest (outdated data)
     # if xstest is a column
     if "xstest" in df.columns:
        df = df.drop(columns=["xstest"])
+
+    # remove column anthropic and summarize_prompted (outdated data)
+    if "anthropic" in df.columns:
+        df = df.drop(columns=["anthropic"])
+    if "summarize_prompted" in df.columns:
+        df = df.drop(columns=["summarize_prompted"])
     return df

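One thing to note about the new `load_all_data(data_repo, subdir, ...)`: the per-model files are located by string concatenation (`data_repo + subdir + "/" + model`), so `data_repo` has to end with a trailing slash, which the `"./evals/herm/"` constant in app.py does. A hedged alternative sketch, not what this commit ships, that builds the same path with `pathlib` and drops that requirement:

```python
from pathlib import Path

from datasets import load_dataset


def load_model_json(data_repo: str, subdir: str, model: str):
    # Layout assumed from the diff: <data_repo>/<subdir>/<org>/<model>.json
    data_file = Path(data_repo) / subdir / model
    # Same loader call the repo uses, just with an OS-safe path string.
    return load_dataset("json", data_files=str(data_file), split="train")
```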