Bram Vanroy commited on
Commit
5693ee5
β€’
1 Parent(s): 0ae29b2

revision for Dutch only

Browse files
Files changed (30) hide show
  1. .gitignore +107 -126
  2. app.py +58 -91
  3. content.py +15 -23
  4. css.py +0 -13
  5. evals/arc/arc_nl_Llama-2-7b-chat-hf.json +6 -6
  6. evals/arc/arc_nl_Llama-2-7b-hf.json +6 -6
  7. evals/arc/{arc_nl_Mistral-7B-v0.1.json β†’ arc_nl_Orca-2-7b.json} +6 -6
  8. evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json β†’ arc/arc_nl_gpt2-large-dutch.json} +8 -8
  9. evals/arc/arc_nl_gpt2-medium-dutch.json +23 -0
  10. evals/arc/arc_nl_zephyr-7b-beta.json +6 -6
  11. evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json +6 -6
  12. evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json +6 -6
  13. evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json +6 -6
  14. evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json β†’ hellaswag_nl_Orca-2-7b.json} +6 -6
  15. evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json +23 -0
  16. evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json +23 -0
  17. evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json +23 -0
  18. evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json β†’ mmlu/mmlu_nl_Mistral-7B-v0.1.json} +8 -8
  19. evals/mmlu/mmlu_nl_gpt2-large-dutch.json +23 -0
  20. evals/mmlu/mmlu_nl_gpt2-medium-dutch.json +23 -0
  21. evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json +0 -23
  22. evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json +6 -6
  23. evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json +4 -4
  24. evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json β†’ truthfulqa_nl_Orca-2-7b.json} +6 -6
  25. evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json +0 -23
  26. evals/truthfulqa/truthfulqa_nl_falcon-40b.json +0 -23
  27. evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json β†’ truthfulqa_nl_gpt2-large-dutch.json} +6 -6
  28. evals/truthfulqa/{truthfulqa_nl-falcon-40b.json β†’ truthfulqa_nl_gpt2-medium-dutch.json} +6 -6
  29. evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json +0 -23
  30. evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json +0 -23
.gitignore CHANGED
@@ -1,92 +1,42 @@
1
- *.txt
2
- !src/**/*.txt
3
- runs*
4
- wandb*
5
- Pipfile*
6
- data/*
7
- muss
8
- models/*
9
- *config.json
10
-
11
- # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
12
- # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
13
-
14
- .idea/
15
- # User-specific stuff
16
- .idea/**/workspace.xml
17
- .idea/**/tasks.xml
18
- .idea/**/usage.statistics.xml
19
- .idea/**/dictionaries
20
- .idea/**/shelf
21
-
22
- # AWS User-specific
23
- .idea/**/aws.xml
24
-
25
- # Generated files
26
- .idea/**/contentModel.xml
27
-
28
- # Sensitive or high-churn files
29
- .idea/**/dataSources/
30
- .idea/**/dataSources.ids
31
- .idea/**/dataSources.local.xml
32
- .idea/**/sqlDataSources.xml
33
- .idea/**/dynamic.xml
34
- .idea/**/uiDesigner.xml
35
- .idea/**/dbnavigator.xml
36
-
37
- # Gradle
38
- .idea/**/gradle.xml
39
- .idea/**/libraries
40
-
41
- # Gradle and Maven with auto-import
42
- # When using Gradle or Maven with auto-import, you should exclude module files,
43
- # since they will be recreated, and may cause churn. Uncomment if using
44
- # auto-import.
45
- # .idea/artifacts
46
- # .idea/compiler.xml
47
- # .idea/jarRepositories.xml
48
- # .idea/modules.xml
49
- # .idea/*.iml
50
- # .idea/modules
51
- # *.iml
52
- # *.ipr
53
-
54
- # CMake
55
- cmake-build-*/
56
 
57
- # Mongo Explorer plugin
58
- .idea/**/mongoSettings.xml
59
 
60
- # File-based project format
61
- *.iws
62
-
63
- # IntelliJ
64
- out/
65
 
66
- # mpeltonen/sbt-idea plugin
67
- .idea_modules/
68
 
69
- # JIRA plugin
70
- atlassian-ide-plugin.xml
71
 
72
- # Cursive Clojure plugin
73
- .idea/replstate.xml
74
 
75
- # SonarLint plugin
76
- .idea/sonarlint/
 
 
 
 
77
 
78
- # Crashlytics plugin (for Android Studio and IntelliJ)
79
- com_crashlytics_export_strings.xml
80
- crashlytics.properties
81
- crashlytics-build.properties
82
- fabric.properties
83
 
84
- # Editor-based Rest Client
85
- .idea/httpRequests
86
 
87
- # Android studio 3.1+ serialized cache file
88
- .idea/caches/build_file_checksums.ser
 
89
 
 
 
90
 
91
  # Byte-compiled / optimized / DLL files
92
  __pycache__/
@@ -110,7 +60,6 @@ parts/
110
  sdist/
111
  var/
112
  wheels/
113
- share/python-wheels/
114
  *.egg-info/
115
  .installed.cfg
116
  *.egg
@@ -129,17 +78,14 @@ pip-delete-this-directory.txt
129
  # Unit test / coverage reports
130
  htmlcov/
131
  .tox/
132
- .nox/
133
  .coverage
134
  .coverage.*
135
  .cache
136
  nosetests.xml
137
  coverage.xml
138
  *.cover
139
- *.py,cover
140
  .hypothesis/
141
  .pytest_cache/
142
- cover/
143
 
144
  # Translations
145
  *.mo
@@ -149,7 +95,6 @@ cover/
149
  *.log
150
  local_settings.py
151
  db.sqlite3
152
- db.sqlite3-journal
153
 
154
  # Flask stuff:
155
  instance/
@@ -162,41 +107,16 @@ instance/
162
  docs/_build/
163
 
164
  # PyBuilder
165
- .pybuilder/
166
  target/
167
 
168
  # Jupyter Notebook
169
  .ipynb_checkpoints
170
 
171
- # IPython
172
- profile_default/
173
- ipython_config.py
174
-
175
  # pyenv
176
- # For a library or package, you might want to ignore these files since the code is
177
- # intended to run in multiple environments; otherwise, check them in:
178
- # .python-version
179
-
180
- # pipenv
181
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
182
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
183
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
184
- # install all needed dependencies.
185
- #Pipfile.lock
186
-
187
- # poetry
188
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
189
- # This is especially recommended for binary packages to ensure reproducibility, and is more
190
- # commonly ignored for libraries.
191
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
192
- #poetry.lock
193
-
194
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow
195
- __pypackages__/
196
-
197
- # Celery stuff
198
  celerybeat-schedule
199
- celerybeat.pid
200
 
201
  # SageMath parsed files
202
  *.sage.py
@@ -222,21 +142,82 @@ venv.bak/
222
 
223
  # mypy
224
  .mypy_cache/
225
- .dmypy.json
226
- dmypy.json
227
 
228
- # Pyre type checker
229
- .pyre/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
- # pytype static type analyzer
232
- .pytype/
 
 
 
233
 
234
- # Cython debug symbols
235
- cython_debug/
236
 
237
- # PyCharm
238
- # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can
239
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
240
- # and can be added to the global gitignore or merged into this file. For a more nuclear
241
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
242
- #.idea/
 
1
+ run-backend.ps
2
+ .eslintrc.js
3
+ .venv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ # ignore compiled styles
6
+ *.css
7
 
8
+ # dependencies
9
+ **/node_modules/
10
+ **/.pnp
11
+ *.pnp.js
 
12
 
13
+ # testing
14
+ /coverage
15
 
16
+ # VSCode
17
+ **/.vscode/
18
 
19
+ # production
20
+ **/build/
21
 
22
+ # misc
23
+ .DS_Store
24
+ .env.local
25
+ .env.development.local
26
+ .env.test.local
27
+ .env.production.local
28
 
29
+ npm-debug.log*
30
+ yarn-debug.log*
31
+ yarn-error.log*
 
 
32
 
 
 
33
 
34
+ # python
35
+ data/
36
+ Pipfile*
37
 
38
+ # .idea (JetBrains)
39
+ **/.idea/
40
 
41
  # Byte-compiled / optimized / DLL files
42
  __pycache__/
 
60
  sdist/
61
  var/
62
  wheels/
 
63
  *.egg-info/
64
  .installed.cfg
65
  *.egg
 
78
  # Unit test / coverage reports
79
  htmlcov/
80
  .tox/
 
81
  .coverage
82
  .coverage.*
83
  .cache
84
  nosetests.xml
85
  coverage.xml
86
  *.cover
 
87
  .hypothesis/
88
  .pytest_cache/
 
89
 
90
  # Translations
91
  *.mo
 
95
  *.log
96
  local_settings.py
97
  db.sqlite3
 
98
 
99
  # Flask stuff:
100
  instance/
 
107
  docs/_build/
108
 
109
  # PyBuilder
 
110
  target/
111
 
112
  # Jupyter Notebook
113
  .ipynb_checkpoints
114
 
 
 
 
 
115
  # pyenv
116
+ .python-version
117
+
118
+ # celery beat schedule file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  celerybeat-schedule
 
120
 
121
  # SageMath parsed files
122
  *.sage.py
 
142
 
143
  # mypy
144
  .mypy_cache/
145
+ test.py
 
146
 
147
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
148
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
149
+
150
+ # User-specific stuff
151
+ .idea/**/workspace.xml
152
+ .idea/**/tasks.xml
153
+ .idea/**/usage.statistics.xml
154
+ .idea/**/dictionaries
155
+ .idea/**/shelf
156
+
157
+ # AWS User-specific
158
+ .idea/**/aws.xml
159
+
160
+ # Generated files
161
+ .idea/**/contentModel.xml
162
+
163
+ # Sensitive or high-churn files
164
+ .idea/**/dataSources/
165
+ .idea/**/dataSources.ids
166
+ .idea/**/dataSources.local.xml
167
+ .idea/**/sqlDataSources.xml
168
+ .idea/**/dynamic.xml
169
+ .idea/**/uiDesigner.xml
170
+ .idea/**/dbnavigator.xml
171
+
172
+ # Gradle
173
+ .idea/**/gradle.xml
174
+ .idea/**/libraries
175
+
176
+ # Gradle and Maven with auto-import
177
+ # When using Gradle or Maven with auto-import, you should exclude module files,
178
+ # since they will be recreated, and may cause churn. Uncomment if using
179
+ # auto-import.
180
+ # .idea/artifacts
181
+ # .idea/compiler.xml
182
+ # .idea/jarRepositories.xml
183
+ # .idea/modules.xml
184
+ # .idea/*.iml
185
+ # .idea/modules
186
+ # *.iml
187
+ # *.ipr
188
+
189
+ # CMake
190
+ cmake-build-*/
191
+
192
+ # Mongo Explorer plugin
193
+ .idea/**/mongoSettings.xml
194
+
195
+ # File-based project format
196
+ *.iws
197
+
198
+ # IntelliJ
199
+ out/
200
+
201
+ # mpeltonen/sbt-idea plugin
202
+ .idea_modules/
203
+
204
+ # JIRA plugin
205
+ atlassian-ide-plugin.xml
206
+
207
+ # Cursive Clojure plugin
208
+ .idea/replstate.xml
209
+
210
+ # SonarLint plugin
211
+ .idea/sonarlint/
212
 
213
+ # Crashlytics plugin (for Android Studio and IntelliJ)
214
+ com_crashlytics_export_strings.xml
215
+ crashlytics.properties
216
+ crashlytics-build.properties
217
+ fabric.properties
218
 
219
+ # Editor-based Rest Client
220
+ .idea/httpRequests
221
 
222
+ # Android studio 3.1+ serialized cache file
223
+ .idea/caches/build_file_checksums.ser
 
 
 
 
app.py CHANGED
@@ -2,12 +2,13 @@ import json
2
  from collections import defaultdict
3
  from pathlib import Path
4
 
 
5
  import pandas as pd
6
  import gradio as gr
 
 
7
 
8
  from content import *
9
- from css import *
10
- import glob
11
 
12
  ARC = "arc"
13
  HELLASWAG = "hellaswag"
@@ -17,51 +18,17 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
17
 
18
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
19
 
20
- LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(",")
21
-
22
- LANG_NAME = {
23
- "ar": "Arabic",
24
- "bn": "Bengali",
25
- "ca": "Catalan",
26
- "da": "Danish",
27
- "de": "German",
28
- "es": "Spanish",
29
- "eu": "Basque",
30
- "fr": "French",
31
- "gu": "Gujarati",
32
- "hi": "Hindi",
33
- "hr": "Croatian",
34
- "hu": "Hungarian",
35
- "hy": "Armenian",
36
- "id": "Indonesian",
37
- "it": "Italian",
38
- "kn": "Kannada",
39
- "ml": "Malayalam",
40
- "mr": "Marathi",
41
- "ne": "Nepali",
42
- "nl": "Dutch",
43
- "pt": "Portuguese",
44
- "ro": "Romanian",
45
- "ru": "Russian",
46
- "sk": "Slovak",
47
- "sr": "Serbian",
48
- "sv": "Swedish",
49
- "ta": "Tamil",
50
- "te": "Telugu",
51
- "uk": "Ukrainian",
52
- "vi": "Vietnamese",
53
- "zh": "Chinese",
54
- }
55
-
56
-
57
- def collect_results():
58
  performance_dict = defaultdict(dict)
59
- pretrained_models = set()
60
  for pfin in Path("evals").rglob("*.json"):
61
  data = json.loads(pfin.read_text(encoding="utf-8"))
62
- if "results" not in data:
63
- continue
64
- if "config" not in data:
65
  continue
66
  results = data["results"]
67
  config = data["config"]
@@ -74,7 +41,6 @@ def collect_results():
74
  continue
75
  pretrained = pretrained[0].split("=")[1]
76
  pretrained = pretrained.split("/")[-1]
77
- pretrained_models.add(pretrained)
78
 
79
  for lang_task, perfs in results.items():
80
  task, lang = lang_task.split("_")
@@ -85,33 +51,46 @@ def collect_results():
85
  p = round(perfs[metric] * 100, 1)
86
  performance_dict[(pretrained, lang)][task] = p
87
 
88
- return performance_dict, pretrained_models
89
 
90
 
91
- def get_leaderboard_df(performance_dict, pretrained_models):
92
- df = list()
 
 
 
 
 
 
93
  for (pretrained, lang), perfs in performance_dict.items():
94
- lang_name = LANG_NAME[lang]
95
  arc_perf = perfs.get(ARC, 0.0)
96
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
97
  mmlu_perf = perfs.get(MMLU, 0.0)
98
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
99
 
100
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
101
- notes = " ".join([pretrained, lang_name])
102
- row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
103
- df.append(row)
104
 
105
- df = pd.DataFrame.from_records(df, columns=COLS)
106
  df = df.sort_values(by=[AVERAGE_COL], ascending=False)
107
- df = df[COLS]
108
-
109
  return df
110
 
111
 
112
- def search_table(df, query):
113
- filtered_df = df[df[NOTES_COL].str.contains(query, case=False)]
114
- return filtered_df
 
 
 
 
 
 
 
 
 
 
 
115
 
116
 
117
  MODEL_COL = "Model"
@@ -120,43 +99,31 @@ ARC_COL = "ARC (25-shot)"
120
  HELLASWAG_COL = "HellaSwag (10-shot)️"
121
  MMLU_COL = "MMLU (5-shot)"
122
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
123
- NOTES_COL = "Notes" # For search only
124
-
125
- COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
126
- TYPES = ["str", "number", "number", "number", "number", "number", "str"]
127
 
128
- args = collect_results()
129
- original_df = get_leaderboard_df(*args)
130
 
131
- demo = gr.Blocks(css=CUSTOM_CSS)
132
- with demo:
 
 
133
  gr.HTML(TITLE)
134
- gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
135
- gr.Markdown(HOW_TO, elem_classes="markdown-text")
136
-
137
- with gr.Box():
138
- search_bar = gr.Textbox(placeholder="Search models and languages...", show_label=False, elem_id="search-bar")
139
-
140
- leaderboard_table = gr.components.Dataframe(
141
- value=original_df,
142
- headers=COLS,
143
- datatype=TYPES,
144
- max_rows=5,
145
- elem_id="leaderboard-table",
146
- )
147
-
148
- # # Dummy leaderboard for handling the case when the user uses backspace key
149
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
150
- value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
151
- )
152
-
153
- search_bar.change(
154
- search_table,
155
- [hidden_leaderboard_table_for_search, search_bar],
156
- leaderboard_table,
157
- )
158
 
159
  gr.Markdown(CREDIT, elem_classes="markdown-text")
160
  gr.Markdown(CITATION, elem_classes="markdown-text")
161
 
162
- demo.launch()
 
 
 
2
  from collections import defaultdict
3
  from pathlib import Path
4
 
5
+ import numpy as np
6
  import pandas as pd
7
  import gradio as gr
8
+ from pandas import DataFrame
9
+ from pandas.io.formats.style import Styler
10
 
11
  from content import *
 
 
12
 
13
  ARC = "arc"
14
  HELLASWAG = "hellaswag"
 
18
 
19
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
20
 
21
+
22
+ def collect_results() -> dict[tuple[str, str], dict[str, float]]:
23
+ """
24
+ Collects results from the evals folder and returns a dictionary of results
25
+ :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
26
+ dictionaries of the form {benchmark_name: performance_score}
27
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  performance_dict = defaultdict(dict)
 
29
  for pfin in Path("evals").rglob("*.json"):
30
  data = json.loads(pfin.read_text(encoding="utf-8"))
31
+ if "results" not in data or "config" not in data:
 
 
32
  continue
33
  results = data["results"]
34
  config = data["config"]
 
41
  continue
42
  pretrained = pretrained[0].split("=")[1]
43
  pretrained = pretrained.split("/")[-1]
 
44
 
45
  for lang_task, perfs in results.items():
46
  task, lang = lang_task.split("_")
 
51
  p = round(perfs[metric] * 100, 1)
52
  performance_dict[(pretrained, lang)][task] = p
53
 
54
+ return dict(performance_dict)
55
 
56
 
57
+ def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
58
+ """
59
+ Builds a dataframe from the performance dictionary
60
+ :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the values are
61
+ dictionaries of the form {benchmark_name: performance_score}
62
+ :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
63
+ """
64
+ data = []
65
  for (pretrained, lang), perfs in performance_dict.items():
 
66
  arc_perf = perfs.get(ARC, 0.0)
67
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
68
  mmlu_perf = perfs.get(MMLU, 0.0)
69
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
70
 
71
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
72
+ row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
73
+ data.append(row)
 
74
 
75
+ df = pd.DataFrame.from_records(data, columns=COLS)
76
  df = df.sort_values(by=[AVERAGE_COL], ascending=False)
 
 
77
  return df
78
 
79
 
80
+ def style_df(df: DataFrame) -> Styler:
81
+ """
82
+ Styles the dataframe by rounding to two decimals and putting the max value in bold per column
83
+ :param df: the dataframe to style
84
+ :return: the Styler
85
+ """
86
+ styler = df.style.format("{:.2f}", subset=df.columns[1:])
87
+
88
+ def highlight_max(col):
89
+ return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
90
+
91
+ styler = styler.apply(highlight_max, axis=1, subset=df.columns[1:])
92
+
93
+ return styler
94
 
95
 
96
  MODEL_COL = "Model"
 
99
  HELLASWAG_COL = "HellaSwag (10-shot)️"
100
  MMLU_COL = "MMLU (5-shot)"
101
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
 
 
 
 
102
 
103
+ COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
104
+ TYPES = ["str", "number", "number", "number", "number", "number"]
105
 
106
+ results = collect_results()
107
+ original_df = build_performance_df(results)
108
+ styled_df = style_df(original_df)
109
+ with gr.Blocks() as demo:
110
  gr.HTML(TITLE)
111
+ gr.Markdown(INTRO_TEXT)
112
+
113
+ gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
114
+ gr.components.Dataframe(
115
+ value=original_df,
116
+ headers=COLS,
117
+ datatype=TYPES,
118
+ elem_id="leaderboard-table",
119
+ )
120
+
121
+ gr.Markdown("## LaTeX")
122
+ gr.Code(styled_df.to_latex(convert_css=True))
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  gr.Markdown(CREDIT, elem_classes="markdown-text")
125
  gr.Markdown(CITATION, elem_classes="markdown-text")
126
 
127
+ if __name__ == '__main__':
128
+ demo.launch()
129
+
content.py CHANGED
@@ -1,44 +1,29 @@
1
- TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard</h1>'
2
 
3
  INTRO_TEXT = f"""
4
  ## About
5
 
6
- This leaderboard tracks progress and ranks performance of large language models (LLMs) developed for different languages,
7
- emphasizing on non-English languages to democratize benefits of LLMs to broader society.
8
- Our current leaderboard provides evaluation data for 29 languages, i.e.,
9
- Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch,
10
- French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam,
11
- Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish,
12
- Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way.
13
- Both multilingual and language-specific LLMs are welcome in this leaderboard.
14
- We currently evaluate models over four benchmarks:
15
 
16
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
17
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
18
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
19
  - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
20
 
21
- The evaluation data was translated into these languages using ChatGPT (gpt-35-turbo).
22
 
23
  """
24
 
25
- HOW_TO = f"""
26
- ## How to list your model performance on this leaderboard:
27
-
28
- Run the evaluation of your model using this repo: <a href="https://github.com/laiviet/lm-evaluation-harness" target="_blank">https://github.com/laiviet/lm-evaluation-harness</a>.
29
-
30
- And then, push the evaluation log and make a pull request.
31
- """
32
-
33
  CREDIT = f"""
34
  ## Credit
35
 
36
- To make this website, we use the following resources:
37
 
38
  - Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
39
- - Funding and GPU access (Adobe Research)
40
  - Evaluation code (EleutherAI's lm_evaluation_harness repo)
41
  - Leaderboard code (Huggingface4's open_llm_leaderboard repo)
 
42
 
43
  """
44
 
@@ -46,12 +31,19 @@ To make this website, we use the following resources:
46
  CITATION = f"""
47
  ## Citation
48
 
49
- ```
50
 
 
 
 
 
 
 
 
 
51
  @misc{{lai2023openllmbenchmark,
52
  author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
53
  title={{Open Multilingual LLM Evaluation Leaderboard}},
54
  year={{2023}}
55
  }}
56
  ```
57
- """
 
1
+ TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard (Dutch only)</h1>'
2
 
3
  INTRO_TEXT = f"""
4
  ## About
5
 
6
+ This is a fork of the [Open Multilingual LLM Evaluation Leaderboard](https://huggingface.co/spaces/uonlp/open_multilingual_llm_leaderboard), but restricted to only Dutch models and augmented with additional model results.
7
+ We test the models on the following benchmarks **for the Dutch version only!!**, which have been translated into Dutch automatically by the original authors of the Open Multilingual LLM Evaluation Leaderboard with `gpt-35-turbo`.
 
 
 
 
 
 
 
8
 
9
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
10
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
11
  - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
12
  - <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
13
 
14
+ I do not maintain those datasets, I only run benchmarks and add the results to this space. For questions regarding the test sets or running them yourself, see [the original Github repository](https://github.com/laiviet/lm-evaluation-harness).
15
 
16
  """
17
 
 
 
 
 
 
 
 
 
18
  CREDIT = f"""
19
  ## Credit
20
 
21
+ This leaderboard has borrowed heavily from the following sources:
22
 
23
  - Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
 
24
  - Evaluation code (EleutherAI's lm_evaluation_harness repo)
25
  - Leaderboard code (Huggingface4's open_llm_leaderboard repo)
26
+ - The multilingual version of the leaderboard (uonlp's open_multilingual_llm_leaderboard repo)
27
 
28
  """
29
 
 
31
  CITATION = f"""
32
  ## Citation
33
 
 
34
 
35
+ If you use or cite the Dutch benchmark results or this specific leaderboard page, please cite the following paper:
36
+
37
+ TBD
38
+
39
+
40
+ If you use the multilingual benchmarks, please cite the following paper:
41
+
42
+ ```bibtex
43
  @misc{{lai2023openllmbenchmark,
44
  author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
45
  title={{Open Multilingual LLM Evaluation Leaderboard}},
46
  year={{2023}}
47
  }}
48
  ```
49
+ """
css.py DELETED
@@ -1,13 +0,0 @@
1
- CUSTOM_CSS = """
2
- /* Hides the final column */
3
- table td:last-child,
4
- table th:last-child {
5
- display: none;
6
- }
7
- # table td:first-child,
8
- # table th:first-child {
9
- # max-width: 400px;
10
- # overflow: auto;
11
- # white-space: nowrap;
12
- # }
13
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_nl_Llama-2-7b-chat-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.3609923011120616,
5
- "acc_stderr": 0.014053373664144792,
6
- "acc_norm": 0.3618477331052181,
7
- "acc_norm_stderr": 0.014060593893704966
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.3550042771599658,
5
+ "acc_stderr": 0.014001474982174305,
6
+ "acc_norm": 0.3609923011120616,
7
+ "acc_norm_stderr": 0.014053373664144789
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/arc_nl_Llama-2-7b-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.33704020530367834,
5
- "acc_stderr": 0.013831300903580639,
6
- "acc_norm": 0.3567151411462789,
7
- "acc_norm_stderr": 0.014016546277185005
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.33447390932420873,
5
+ "acc_stderr": 0.013805185437125271,
6
+ "acc_norm": 0.3558597091531223,
7
+ "acc_norm_stderr": 0.014009035017396714
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/{arc_nl_Mistral-7B-v0.1.json β†’ arc_nl_Orca-2-7b.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.42087254063301965,
5
- "acc_stderr": 0.014445778557368833,
6
- "acc_norm": 0.4294268605645851,
7
- "acc_norm_stderr": 0.014483677397351059
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.3661248930710009,
5
+ "acc_stderr": 0.014095972894279241,
6
+ "acc_norm": 0.3678357570573139,
7
+ "acc_norm_stderr": 0.014109788842173
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json β†’ arc/arc_nl_gpt2-large-dutch.json} RENAMED
@@ -1,19 +1,19 @@
1
  {
2
  "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062874,
6
- "mc2": 0.4103755310313891,
7
- "mc2_stderr": 0.014811313488625848
8
  }
9
  },
10
  "versions": {
11
- "truthfulqa_nl": 1
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
+ "arc_nl": {
4
+ "acc": 0.20102651839178784,
5
+ "acc_stderr": 0.011726581781869408,
6
+ "acc_norm": 0.24037639007698888,
7
+ "acc_norm_stderr": 0.01250327289928353
8
  }
9
  },
10
  "versions": {
11
+ "arc_nl": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/arc_nl_gpt2-medium-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_nl": {
4
+ "acc": 0.21471343028229256,
5
+ "acc_stderr": 0.012014958326088981,
6
+ "acc_norm": 0.24294268605645852,
7
+ "acc_norm_stderr": 0.012548588352773891
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_nl": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/arc/arc_nl_zephyr-7b-beta.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "arc_nl": {
4
- "acc": 0.43798118049615054,
5
- "acc_stderr": 0.01451716231691793,
6
- "acc_norm": 0.4328485885372113,
7
- "acc_norm_stderr": 0.01449759923259859
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "arc_nl": {
4
+ "acc": 0.4311377245508982,
5
+ "acc_stderr": 0.014490726457652989,
6
+ "acc_norm": 0.43199315654405473,
7
+ "acc_norm_stderr": 0.014494184864971338
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.38467350242849435,
5
- "acc_stderr": 0.005054749888300686,
6
- "acc_norm": 0.4823529411764706,
7
- "acc_norm_stderr": 0.005191586180318448
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.3838100377765785,
5
+ "acc_stderr": 0.005052614927289456,
6
+ "acc_norm": 0.4819212088505127,
7
+ "acc_norm_stderr": 0.005191425828002782
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.3878035617916892,
5
- "acc_stderr": 0.005062348307428708,
6
- "acc_norm": 0.5000539665407447,
7
- "acc_norm_stderr": 0.005194822688012659
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.386184565569347,
5
+ "acc_stderr": 0.00505844561828187,
6
+ "acc_norm": 0.4957366432811657,
7
+ "acc_norm_stderr": 0.0051946338704556266
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.43486238532110094,
5
- "acc_stderr": 0.005150551758279897,
6
- "acc_norm": 0.5676200755531571,
7
- "acc_norm_stderr": 0.005147097096977192
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.4336751214247167,
5
+ "acc_stderr": 0.0051489159372014965,
6
+ "acc_norm": 0.5662169454937939,
7
+ "acc_norm_stderr": 0.005149065890785751
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json β†’ hellaswag_nl_Orca-2-7b.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
- "acc": 0.4478143550998381,
5
- "acc_stderr": 0.005166450687025188,
6
- "acc_norm": 0.575067458175931,
7
- "acc_norm_stderr": 0.005135942094754352
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "hellaswag_nl": {
4
+ "acc": 0.38456556934700487,
5
+ "acc_stderr": 0.005054483938257531,
6
+ "acc_norm": 0.48041014570966,
7
+ "acc_norm_stderr": 0.005190834031799853
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag_nl": {
4
+ "acc": 0.3043712898003238,
5
+ "acc_stderr": 0.004780698091128437,
6
+ "acc_norm": 0.34279546681057743,
7
+ "acc_norm_stderr": 0.004931380767300367
8
+ }
9
+ },
10
+ "versions": {
11
+ "hellaswag_nl": 1
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag_nl": {
4
+ "acc": 0.31246627091203455,
5
+ "acc_stderr": 0.004815587775923881,
6
+ "acc_norm": 0.36438208310847275,
7
+ "acc_norm_stderr": 0.00500008398696681
8
+ }
9
+ },
10
+ "versions": {
11
+ "hellaswag_nl": 1
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag_nl": {
4
+ "acc": 0.44069077172153265,
5
+ "acc_stderr": 0.0051581467942195215,
6
+ "acc_norm": 0.5429033998920669,
7
+ "acc_norm_stderr": 0.005175663147811796
8
+ }
9
+ },
10
+ "versions": {
11
+ "hellaswag_nl": 1
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=Intel/neural-chat-7b-v3-1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json β†’ mmlu/mmlu_nl_Mistral-7B-v0.1.json} RENAMED
@@ -1,19 +1,19 @@
1
  {
2
  "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.3070063694267516,
5
- "mc1_stderr": 0.01647328769082192,
6
- "mc2": 0.45280570817630444,
7
- "mc2_stderr": 0.015014728029135574
8
  }
9
  },
10
  "versions": {
11
- "truthfulqa_nl": 1
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
+ "mmlu_nl": {
4
+ "acc": 0.45974045685664416,
5
+ "acc_stderr": 0.004341759787221058,
6
+ "acc_norm": 0.36912802610609396,
7
+ "acc_norm_stderr": 0.0042040447899996366
8
  }
9
  },
10
  "versions": {
11
+ "mmlu_nl": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/mmlu/mmlu_nl_gpt2-large-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "mmlu_nl": {
4
+ "acc": 0.2301737876603172,
5
+ "acc_stderr": 0.003667182186959482,
6
+ "acc_norm": 0.2436821734841011,
7
+ "acc_norm_stderr": 0.0037400056232706905
8
+ }
9
+ },
10
+ "versions": {
11
+ "mmlu_nl": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/mmlu/mmlu_nl_gpt2-medium-dutch.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "mmlu_nl": {
4
+ "acc": 0.23343704940426502,
5
+ "acc_stderr": 0.0036852504856799066,
6
+ "acc_norm": 0.2483873415800258,
7
+ "acc_norm_stderr": 0.003764176503735655
8
+ }
9
+ },
10
+ "versions": {
11
+ "mmlu_nl": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062874,
6
- "mc2": 0.4103755310313891,
7
- "mc2_stderr": 0.014811313488625848
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.2917197452229299,
5
- "mc1_stderr": 0.016234071293195287,
6
- "mc2": 0.4462996697687161,
7
- "mc2_stderr": 0.016161710042968205
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.289171974522293,
5
+ "mc1_stderr": 0.016192068781346693,
6
+ "mc2": 0.4445882138885173,
7
+ "mc2_stderr": 0.016144169053565395
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json CHANGED
@@ -3,8 +3,8 @@
3
  "truthfulqa_nl": {
4
  "mc1": 0.28152866242038216,
5
  "mc1_stderr": 0.016062309899461683,
6
- "mc2": 0.41626070733921117,
7
- "mc2_stderr": 0.014914193769419527
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
3
  "truthfulqa_nl": {
4
  "mc1": 0.28152866242038216,
5
  "mc1_stderr": 0.016062309899461683,
6
+ "mc2": 0.41449853431238814,
7
+ "mc2_stderr": 0.014922005996963188
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json β†’ truthfulqa_nl_Orca-2-7b.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.310828025477707,
5
- "mc1_stderr": 0.016529733724696277,
6
- "mc2": 0.4460845208916539,
7
- "mc2_stderr": 0.01476856418537487
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/falcon-40b-ft-alpaca-dolly-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.3146496815286624,
5
+ "mc1_stderr": 0.01658486445168711,
6
+ "mc2": 0.4488463711895695,
7
+ "mc2_stderr": 0.016292493035951996
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.310828025477707,
5
- "mc1_stderr": 0.016529733724696277,
6
- "mc2": 0.4460845208916539,
7
- "mc2_stderr": 0.01476856418537487
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/falcon-40b-ft-alpaca-dolly-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/truthfulqa_nl_falcon-40b.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062875,
6
- "mc2": 0.4091336161450544,
7
- "mc2_stderr": 0.014605140809282338
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=tiiuae/falcon-40b,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json β†’ truthfulqa_nl_gpt2-large-dutch.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.2751592356687898,
5
- "mc1_stderr": 0.0159498029022655,
6
- "mc2": 0.41816127879466414,
7
- "mc2_stderr": 0.01474120131034505
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.25987261146496815,
5
+ "mc1_stderr": 0.015663018533664023,
6
+ "mc2": 0.41961324970531233,
7
+ "mc2_stderr": 0.01509691194885121
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/{truthfulqa_nl-falcon-40b.json β†’ truthfulqa_nl_gpt2-medium-dutch.json} RENAMED
@@ -1,10 +1,10 @@
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
- "mc1": 0.2764331210191083,
5
- "mc1_stderr": 0.01597262688062875,
6
- "mc2": 0.4091336161450544,
7
- "mc2_stderr": 0.014605140809282338
8
  }
9
  },
10
  "versions": {
@@ -12,8 +12,8 @@
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=tiiuae/falcon-40b,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
  "truthfulqa_nl": {
4
+ "mc1": 0.2878980891719745,
5
+ "mc1_stderr": 0.0161708346142461,
6
+ "mc2": 0.4527386932512769,
7
+ "mc2_stderr": 0.015417954968769677
8
  }
9
  },
10
  "versions": {
 
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
+ "batch_size": "auto",
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.2751592356687898,
5
- "mc1_stderr": 0.0159498029022655,
6
- "mc2": 0.41816127879466414,
7
- "mc2_stderr": 0.01474120131034505
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 8,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.3719745222929936,
5
- "mc1_stderr": 0.0172618443903749,
6
- "mc2": 0.5294532108691418,
7
- "mc2_stderr": 0.016221848481192833
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
- "batch_size": 64,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }