diff --git "a/lm_evaluation_harness.ipynb" "b/lm_evaluation_harness.ipynb"
new file mode 100644--- /dev/null
+++ "b/lm_evaluation_harness.ipynb"
@@ -0,0 +1,713 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Ac6wadk3rmkK"
+      },
+      "source": [
+        "# LM Evaluation Harness (by [EleutherAI](https://www.eleuther.ai/))\n",
+        "\n",
+        "The [`LM-Evaluation-Harness`](https://github.com/EleutherAI/lm-evaluation-harness) provides a unified framework for testing generative language models on a large number of different evaluation tasks. For a complete list of available tasks, see the [task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md), or scroll to the bottom of this notebook.\n",
+        "\n",
+        "1. Clone the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) repository and install the necessary libraries (`sentencepiece` is required for the Llama tokenizer)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "UA5I86u91e0A",
+        "outputId": "d74b3cab-b292-43db-bd5d-523424d2c97a"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Cloning into 'lm-evaluation-harness'...\n",
+            "remote: Enumerating objects: 22343, done.\u001b[K\n",
+            "remote: Counting objects: 100% (7096/7096), done.\u001b[K\n",
+            "remote: Compressing objects: 100% (703/703), done.\u001b[K\n",
+            "remote: Total 22343 (delta 6540), reused 6659 (delta 6392), pack-reused 15247\u001b[K\n",
+            "Receiving objects: 100% (22343/22343), 20.57 MiB | 11.37 MiB/s, done.\n",
+            "Resolving deltas: 100% (15456/15456), done.\n",
+            "Obtaining file:///content/lm-evaluation-harness\n",
+            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "Collecting datasets>=2.0.0 (from lm-eval==0.3.0)\n",
+            "  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.6/519.6 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting einops (from lm-eval==0.3.0)\n",
+            "  Downloading einops-0.7.0-py3-none-any.whl (44 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting jsonlines (from lm-eval==0.3.0)\n",
+            "  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
+            "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==0.3.0) (2.8.7)\n",
+            "Collecting openai>=0.6.4 (from lm-eval==0.3.0)\n",
+            "  Downloading openai-0.28.1-py3-none-any.whl (76 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.0/77.0 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting omegaconf>=2.2 (from lm-eval==0.3.0)\n",
+            "  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting peft>=0.2.0 (from lm-eval==0.3.0)\n",
+            "  Downloading peft-0.5.0-py3-none-any.whl (85 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.6/85.6 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==0.3.0)\n",
+            "  Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting pycountry (from lm-eval==0.3.0)\n",
+            "  Downloading pycountry-22.3.5.tar.gz (10.1 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.1/10.1 MB\u001b[0m \u001b[31m85.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+            "Collecting pytablewriter (from lm-eval==0.3.0)\n",
+            "  Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==0.3.0)\n",
+            "  Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
+            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "Collecting sacrebleu==1.5.0 (from lm-eval==0.3.0)\n",
+            "  Downloading sacrebleu-1.5.0-py3-none-any.whl (65 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.6/65.6 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==0.3.0) (1.2.2)\n",
+            "Collecting sqlitedict (from lm-eval==0.3.0)\n",
+            "  Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
+            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "Requirement already satisfied: torch>=1.7 in /usr/local/lib/python3.10/dist-packages (from lm-eval==0.3.0) (2.0.1+cu118)\n",
+            "Collecting tqdm-multiprocess (from lm-eval==0.3.0)\n",
+            "  Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
+            "Collecting transformers>=4.1 (from lm-eval==0.3.0)\n",
+            "  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting zstandard (from lm-eval==0.3.0)\n",
+            "  Downloading zstandard-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m85.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting accelerate>=0.17.1 (from lm-eval==0.3.0)\n",
+            "  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m258.1/258.1 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting portalocker (from sacrebleu==1.5.0->lm-eval==0.3.0)\n",
+            "  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n",
+            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (1.23.5)\n",
+            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (23.2)\n",
+            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (5.9.5)\n",
+            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (6.0.1)\n",
+            "Collecting huggingface-hub (from accelerate>=0.17.1->lm-eval==0.3.0)\n",
+            "  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m27.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (9.0.0)\n",
+            "Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==0.3.0)\n",
+            "  Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (1.5.3)\n",
+            "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (2.31.0)\n",
+            "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (4.66.1)\n",
+            "Collecting xxhash (from datasets>=2.0.0->lm-eval==0.3.0)\n",
+            "  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m21.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting multiprocess (from datasets>=2.0.0->lm-eval==0.3.0)\n",
+            "  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: fsspec[http]<2023.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (2023.6.0)\n",
+            "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (3.8.6)\n",
+            "Collecting antlr4-python3-runtime==4.9.* (from omegaconf>=2.2->lm-eval==0.3.0)\n",
+            "  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "Collecting safetensors (from peft>=0.2.0->lm-eval==0.3.0)\n",
+            "  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m66.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==0.3.0) (1.4.0)\n",
+            "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==0.3.0) (3.8.1)\n",
+            "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==0.3.0) (1.16.0)\n",
+            "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==0.3.0) (1.11.3)\n",
+            "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==0.3.0) (1.3.2)\n",
+            "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==0.3.0) (3.2.0)\n",
+            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (3.12.4)\n",
+            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (4.5.0)\n",
+            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (1.12)\n",
+            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (3.1)\n",
+            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (3.1.2)\n",
+            "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (2.0.0)\n",
+            "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.7->lm-eval==0.3.0) (3.27.6)\n",
+            "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.7->lm-eval==0.3.0) (17.0.2)\n",
+            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm-eval==0.3.0) (2023.6.3)\n",
+            "Collecting tokenizers<0.15,>=0.14 (from transformers>=4.1->lm-eval==0.3.0)\n",
+            "  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m118.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm-eval==0.3.0) (23.1.0)\n",
+            "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from pycountry->lm-eval==0.3.0) (67.7.2)\n",
+            "Collecting DataProperty<2,>=1.0.1 (from pytablewriter->lm-eval==0.3.0)\n",
+            "  Downloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\n",
+            "Collecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm-eval==0.3.0)\n",
+            "  Downloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\n",
+            "Collecting pathvalidate<4,>=2.3.0 (from pytablewriter->lm-eval==0.3.0)\n",
+            "  Downloading pathvalidate-3.2.0-py3-none-any.whl (23 kB)\n",
+            "Collecting tabledata<2,>=1.3.1 (from pytablewriter->lm-eval==0.3.0)\n",
+            "  Downloading tabledata-1.3.3-py3-none-any.whl (11 kB)\n",
+            "Collecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm-eval==0.3.0)\n",
+            "  Downloading tcolorpy-0.1.4-py3-none-any.whl (7.9 kB)\n",
+            "Collecting typepy[datetime]<2,>=1.3.2 (from pytablewriter->lm-eval==0.3.0)\n",
+            "  Downloading typepy-1.3.2-py3-none-any.whl (31 kB)\n",
+            "Collecting colorama (from tqdm-multiprocess->lm-eval==0.3.0)\n",
+            "  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
+            "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (3.3.0)\n",
+            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (6.0.4)\n",
+            "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (4.0.3)\n",
+            "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (1.9.2)\n",
+            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (1.4.0)\n",
+            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (1.3.1)\n",
+            "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm-eval==0.3.0) (5.2.0)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==0.3.0) (3.4)\n",
+            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==0.3.0) (2.0.6)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==0.3.0) (2023.7.22)\n",
+            "Collecting huggingface-hub (from accelerate>=0.17.1->lm-eval==0.3.0)\n",
+            "  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m34.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==0.3.0) (2.8.2)\n",
+            "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==0.3.0) (2023.3.post1)\n",
+            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.7->lm-eval==0.3.0) (2.1.3)\n",
+            "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==0.3.0) (8.1.7)\n",
+            "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.7->lm-eval==0.3.0) (1.3.0)\n",
+            "Building wheels for collected packages: antlr4-python3-runtime, rouge-score, pycountry, sqlitedict\n",
+            "  Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=a2f0b8953193e72a5cc4d402cd57becdaf2e11c29b664a7bc1dd0a2be7b14c34\n",
+            "  Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n",
+            "  Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=b800533290e8b115b69386f5528faaeec21bdaf0b27df954f91293ce884d2fae\n",
+            "  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
+            "  Building wheel for pycountry (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681833 sha256=c76dd8d8880795167eba1833e4b4f85fd1d2989d3e3c2a3c14ac581d784ec607\n",
+            "  Stored in directory: /root/.cache/pip/wheels/03/57/cc/290c5252ec97a6d78d36479a3c5e5ecc76318afcb241ad9dbe\n",
+            "  Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16864 sha256=38ab29686a73c7df8c33252ed6b8986475d0548f9adf841373e4ecaf8d995201\n",
+            "  Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
+            "Successfully built antlr4-python3-runtime rouge-score pycountry sqlitedict\n",
+            "Installing collected packages: sqlitedict, antlr4-python3-runtime, zstandard, xxhash, tcolorpy, safetensors, pycountry, pybind11, portalocker, pathvalidate, omegaconf, mbstrdecoder, jsonlines, einops, dill, colorama, typepy, tqdm-multiprocess, sacrebleu, rouge-score, multiprocess, huggingface-hub, tokenizers, openai, transformers, datasets, DataProperty, tabledata, pytablewriter, accelerate, peft, lm-eval\n",
+            "  Running setup.py develop for lm-eval\n",
+            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+            "llmx 0.0.15a0 requires cohere, which is not installed.\n",
+            "llmx 0.0.15a0 requires tiktoken, which is not installed.\u001b[0m\u001b[31m\n",
+            "\u001b[0mSuccessfully installed DataProperty-1.0.1 accelerate-0.23.0 antlr4-python3-runtime-4.9.3 colorama-0.4.6 datasets-2.14.5 dill-0.3.7 einops-0.7.0 huggingface-hub-0.17.3 jsonlines-4.0.0 lm-eval-0.3.0 mbstrdecoder-1.1.3 multiprocess-0.70.15 omegaconf-2.3.0 openai-0.28.1 pathvalidate-3.2.0 peft-0.5.0 portalocker-2.8.2 pybind11-2.11.1 pycountry-22.3.5 pytablewriter-1.2.0 rouge-score-0.1.2 sacrebleu-1.5.0 safetensors-0.4.0 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.4 tokenizers-0.14.1 tqdm-multiprocess-0.0.11 transformers-4.34.0 typepy-1.3.2 xxhash-3.4.1 zstandard-0.21.0\n",
+            "Collecting cohere\n",
+            "  Downloading cohere-4.30-py3-none-any.whl (47 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.8/47.8 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting tiktoken\n",
+            "  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m30.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hCollecting sentencepiece\n",
+            "  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m75.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: aiohttp<4.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (3.8.6)\n",
+            "Collecting backoff<3.0,>=2.0 (from cohere)\n",
+            "  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
+            "Collecting fastavro==1.8.2 (from cohere)\n",
+            "  Downloading fastavro-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m97.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hRequirement already satisfied: importlib_metadata<7.0,>=6.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (6.8.0)\n",
+            "Requirement already satisfied: requests<3.0.0,>=2.25.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.31.0)\n",
+            "Requirement already satisfied: urllib3<3,>=1.26 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.0.6)\n",
+            "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2023.6.3)\n",
+            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (23.1.0)\n",
+            "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (3.3.0)\n",
+            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (6.0.4)\n",
+            "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (4.0.3)\n",
+            "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (1.9.2)\n",
+            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (1.4.0)\n",
+            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (1.3.1)\n",
+            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib_metadata<7.0,>=6.0->cohere) (3.17.0)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.25.0->cohere) (3.4)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.25.0->cohere) (2023.7.22)\n",
+            "Installing collected packages: sentencepiece, fastavro, backoff, tiktoken, cohere\n",
+            "Successfully installed backoff-2.2.1 cohere-4.30 fastavro-1.8.2 sentencepiece-0.1.99 tiktoken-0.5.1\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Clone the harness with a shell escape (`!`); IPython has no `%git` magic.\n",
+        "!git clone https://github.com/EleutherAI/lm-evaluation-harness\n",
+        "# Editable install by path, so the notebook's working directory is left unchanged.\n",
+        "%pip install -e ./lm-evaluation-harness\n",
+        "# Extra packages reported missing by pip during install, plus the tokenizer backend.\n",
+        "%pip install cohere tiktoken sentencepiece"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "pnHoAVK25QZn",
+        "outputId": "4253b115-702c-4f31-f1b3-f0483c527841"
+      },
+      "outputs": [],
+      "source": [
+        "# Run the evaluation in a subshell: `!cd ... &&` only changes directory for this command,\n",
+        "# whereas `%cd` would treat everything after it as a directory name and fail.\n",
+        "!cd lm-evaluation-harness && python main.py \\\n",
+        "    --model hf-causal \\\n",
+        "    --model_args pretrained=nicholasKluge/Aira-2-1B1 \\\n",
+        "    --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions  \\\n",
+        "    --device cuda:0"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4Bm78wiZ4Own"
+      },
+      "source": [
+        "## Task Table 📚\n",
+        "\n",
+        "|                        Task Name                        |Train|Val|Test|Val/Test Docs|                                                                                     Metrics                                                                                     |\n",
+        "|---------------------------------------------------------|-----|---|----|------------:|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n",
+        "|anagrams1                                                |     |✓  |    |        10000|acc                                                                                                                                                                              |\n",
+        "|anagrams2                                                |     |✓  |    |        10000|acc                                                                                                                                                                              |\n",
+        "|anli_r1                                                  |✓    |✓  |✓   |         1000|acc                                                                                                                                                                              |\n",
+        "|anli_r2                                                  |✓    |✓  |✓   |         1000|acc                                                                                                                                                                              |\n",
+        "|anli_r3                                                  |✓    |✓  |✓   |         1200|acc                                                                                                                                                                              |\n",
+        "|arc_challenge                                            |✓    |✓  |✓   |         1172|acc, acc_norm                                                                                                                                                                    |\n",
+        "|arc_easy                                                 |✓    |✓  |✓   |         2376|acc, acc_norm                                                                                                                                                                    |\n",
+        "|arithmetic_1dc                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_2da                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_2dm                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_2ds                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_3da                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_3ds                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_4da                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_4ds                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_5da                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|arithmetic_5ds                                           |     |✓  |    |         2000|acc                                                                                                                                                                              |\n",
+        "|bigbench_causal_judgement                                |     |   |✓   |          190|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_date_understanding                              |     |   |✓   |          369|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_disambiguation_qa                               |     |   |✓   |          258|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_dyck_languages                                  |     |   |✓   |         1000|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_formal_fallacies_syllogisms_negation            |     |   |✓   |        14200|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_geometric_shapes                                |     |   |✓   |          359|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_hyperbaton                                      |     |   |✓   |        50000|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_logical_deduction_five_objects                  |     |   |✓   |          500|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_logical_deduction_seven_objects                 |     |   |✓   |          700|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_logical_deduction_three_objects                 |     |   |✓   |          300|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_movie_recommendation                            |     |   |✓   |          500|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_navigate                                        |     |   |✓   |         1000|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_reasoning_about_colored_objects                 |     |   |✓   |         2000|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_ruin_names                                      |     |   |✓   |          448|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_salient_translation_error_detection             |     |   |✓   |          998|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_snarks                                          |     |   |✓   |          181|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_sports_understanding                            |     |   |✓   |          986|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_temporal_sequences                              |     |   |✓   |         1000|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_tracking_shuffled_objects_five_objects          |     |   |✓   |         1250|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_tracking_shuffled_objects_seven_objects         |     |   |✓   |         1750|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|bigbench_tracking_shuffled_objects_three_objects         |     |   |✓   |          300|multiple_choice_grade, exact_str_match                                                                                                                                           |\n",
+        "|blimp_adjunct_island                                     |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_anaphor_gender_agreement                           |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_anaphor_number_agreement                           |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_animate_subject_passive                            |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_animate_subject_trans                              |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_causative                                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_complex_NP_island                                  |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_coordinate_structure_constraint_complex_left_branch|     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_coordinate_structure_constraint_object_extraction  |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_1                        |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_2                        |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_irregular_1              |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_irregular_2              |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_with_adj_2               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_with_adj_irregular_1     |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_with_adj_irregular_2     |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_determiner_noun_agreement_with_adjective_1         |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_distractor_agreement_relational_noun               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_distractor_agreement_relative_clause               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_drop_argument                                      |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_ellipsis_n_bar_1                                   |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_ellipsis_n_bar_2                                   |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_existential_there_object_raising                   |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_existential_there_quantifiers_1                    |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_existential_there_quantifiers_2                    |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_existential_there_subject_raising                  |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_expletive_it_object_raising                        |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_inchoative                                         |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_intransitive                                       |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_irregular_past_participle_adjectives               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_irregular_past_participle_verbs                    |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_irregular_plural_subject_verb_agreement_1          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_irregular_plural_subject_verb_agreement_2          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_left_branch_island_echo_question                   |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_left_branch_island_simple_question                 |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_matrix_question_npi_licensor_present               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_npi_present_1                                      |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_npi_present_2                                      |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_only_npi_licensor_present                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_only_npi_scope                                     |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_passive_1                                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_passive_2                                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_principle_A_c_command                              |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_principle_A_case_1                                 |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_principle_A_case_2                                 |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_principle_A_domain_1                               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_principle_A_domain_2                               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_principle_A_domain_3                               |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_principle_A_reconstruction                         |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_regular_plural_subject_verb_agreement_1            |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_regular_plural_subject_verb_agreement_2            |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_sentential_negation_npi_licensor_present           |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_sentential_negation_npi_scope                      |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_sentential_subject_island                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_superlative_quantifiers_1                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_superlative_quantifiers_2                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_tough_vs_raising_1                                 |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_tough_vs_raising_2                                 |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_transitive                                         |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_island                                          |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_questions_object_gap                            |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_questions_subject_gap                           |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_questions_subject_gap_long_distance             |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_vs_that_no_gap                                  |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_vs_that_no_gap_long_distance                    |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_vs_that_with_gap                                |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|blimp_wh_vs_that_with_gap_long_distance                  |     |✓  |    |         1000|acc                                                                                                                                                                              |\n",
+        "|boolq                                                    |✓    |✓  |    |         3270|acc                                                                                                                                                                              |\n",
+        "|cb                                                       |✓    |✓  |    |           56|acc, f1                                                                                                                                                                          |\n",
+        "|cola                                                     |✓    |✓  |    |         1043|mcc                                                                                                                                                                              |\n",
+        "|copa                                                     |✓    |✓  |    |          100|acc                                                                                                                                                                              |\n",
+        "|coqa                                                     |✓    |✓  |    |          500|f1, em                                                                                                                                                                           |\n",
+        "|crows_pairs_english                                      |     |✓  |    |         1677|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_age                                  |     |✓  |    |           91|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_autre                                |     |✓  |    |           11|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_disability                           |     |✓  |    |           65|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_gender                               |     |✓  |    |          320|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_nationality                          |     |✓  |    |          216|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_physical_appearance                  |     |✓  |    |           72|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_race_color                           |     |✓  |    |          508|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_religion                             |     |✓  |    |          111|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_sexual_orientation                   |     |✓  |    |           93|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_english_socioeconomic                        |     |✓  |    |          190|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french                                       |     |✓  |    |         1677|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_age                                   |     |✓  |    |           90|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_autre                                 |     |✓  |    |           13|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_disability                            |     |✓  |    |           66|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_gender                                |     |✓  |    |          321|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_nationality                           |     |✓  |    |          253|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_physical_appearance                   |     |✓  |    |           72|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_race_color                            |     |✓  |    |          460|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_religion                              |     |✓  |    |          115|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_sexual_orientation                    |     |✓  |    |           91|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|crows_pairs_french_socioeconomic                         |     |✓  |    |          196|likelihood_difference, pct_stereotype                                                                                                                                            |\n",
+        "|cycle_letters                                            |     |✓  |    |        10000|acc                                                                                                                                                                              |\n",
+        "|drop                                                     |✓    |✓  |    |         9536|em, f1                                                                                                                                                                           |\n",
+        "|ethics_cm                                                |✓    |   |✓   |         3885|acc                                                                                                                                                                              |\n",
+        "|ethics_deontology                                        |✓    |   |✓   |         3596|acc, em                                                                                                                                                                          |\n",
+        "|ethics_justice                                           |✓    |   |✓   |         2704|acc, em                                                                                                                                                                          |\n",
+        "|ethics_utilitarianism                                    |✓    |   |✓   |         4808|acc                                                                                                                                                                              |\n",
+        "|ethics_utilitarianism_original                           |     |   |✓   |         4808|acc                                                                                                                                                                              |\n",
+        "|ethics_virtue                                            |✓    |   |✓   |         4975|acc, em                                                                                                                                                                          |\n",
+        "|gsm8k                                                    |✓    |   |✓   |         1319|acc                                                                                                                                                                              |\n",
+        "|headqa                                                   |✓    |✓  |✓   |         2742|acc, acc_norm                                                                                                                                                                    |\n",
+        "|headqa_en                                                |✓    |✓  |✓   |         2742|acc, acc_norm                                                                                                                                                                    |\n",
+        "|headqa_es                                                |✓    |✓  |✓   |         2742|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hellaswag                                                |✓    |✓  |    |        10042|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-abstract_algebra                           |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-anatomy                                    |     |✓  |✓   |          135|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-astronomy                                  |     |✓  |✓   |          152|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-business_ethics                            |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-clinical_knowledge                         |     |✓  |✓   |          265|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-college_biology                            |     |✓  |✓   |          144|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-college_chemistry                          |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-college_computer_science                   |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-college_mathematics                        |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-college_medicine                           |     |✓  |✓   |          173|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-college_physics                            |     |✓  |✓   |          102|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-computer_security                          |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-conceptual_physics                         |     |✓  |✓   |          235|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-econometrics                               |     |✓  |✓   |          114|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-electrical_engineering                     |     |✓  |✓   |          145|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-elementary_mathematics                     |     |✓  |✓   |          378|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-formal_logic                               |     |✓  |✓   |          126|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-global_facts                               |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_biology                        |     |✓  |✓   |          310|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_chemistry                      |     |✓  |✓   |          203|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_computer_science               |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_european_history               |     |✓  |✓   |          165|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_geography                      |     |✓  |✓   |          198|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_government_and_politics        |     |✓  |✓   |          193|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_macroeconomics                 |     |✓  |✓   |          390|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_mathematics                    |     |✓  |✓   |          270|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_microeconomics                 |     |✓  |✓   |          238|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_physics                        |     |✓  |✓   |          151|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_psychology                     |     |✓  |✓   |          545|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_statistics                     |     |✓  |✓   |          216|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_us_history                     |     |✓  |✓   |          204|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-high_school_world_history                  |     |✓  |✓   |          237|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-human_aging                                |     |✓  |✓   |          223|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-human_sexuality                            |     |✓  |✓   |          131|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-international_law                          |     |✓  |✓   |          121|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-jurisprudence                              |     |✓  |✓   |          108|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-logical_fallacies                          |     |✓  |✓   |          163|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-machine_learning                           |     |✓  |✓   |          112|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-management                                 |     |✓  |✓   |          103|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-marketing                                  |     |✓  |✓   |          234|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-medical_genetics                           |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-miscellaneous                              |     |✓  |✓   |          783|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-moral_disputes                             |     |✓  |✓   |          346|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-moral_scenarios                            |     |✓  |✓   |          895|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-nutrition                                  |     |✓  |✓   |          306|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-philosophy                                 |     |✓  |✓   |          311|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-prehistory                                 |     |✓  |✓   |          324|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-professional_accounting                    |     |✓  |✓   |          282|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-professional_law                           |     |✓  |✓   |         1534|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-professional_medicine                      |     |✓  |✓   |          272|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-professional_psychology                    |     |✓  |✓   |          612|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-public_relations                           |     |✓  |✓   |          110|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-security_studies                           |     |✓  |✓   |          245|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-sociology                                  |     |✓  |✓   |          201|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-us_foreign_policy                          |     |✓  |✓   |          100|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-virology                                   |     |✓  |✓   |          166|acc, acc_norm                                                                                                                                                                    |\n",
+        "|hendrycksTest-world_religions                            |     |✓  |✓   |          171|acc, acc_norm                                                                                                                                                                    |\n",
+        "|iwslt17-ar-en                                            |     |   |✓   |         1460|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|iwslt17-en-ar                                            |     |   |✓   |         1460|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|lambada_openai                                           |     |   |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_openai_cloze                                     |     |   |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_openai_mt_de                                     |     |   |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_openai_mt_en                                     |     |   |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_openai_mt_es                                     |     |   |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_openai_mt_fr                                     |     |   |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_openai_mt_it                                     |     |   |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_standard                                         |     |✓  |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|lambada_standard_cloze                                   |     |✓  |✓   |         5153|ppl, acc                                                                                                                                                                         |\n",
+        "|logiqa                                                   |✓    |✓  |✓   |          651|acc, acc_norm                                                                                                                                                                    |\n",
+        "|math_algebra                                             |✓    |   |✓   |         1187|acc                                                                                                                                                                              |\n",
+        "|math_asdiv                                               |     |✓  |    |         2305|acc                                                                                                                                                                              |\n",
+        "|math_counting_and_prob                                   |✓    |   |✓   |          474|acc                                                                                                                                                                              |\n",
+        "|math_geometry                                            |✓    |   |✓   |          479|acc                                                                                                                                                                              |\n",
+        "|math_intermediate_algebra                                |✓    |   |✓   |          903|acc                                                                                                                                                                              |\n",
+        "|math_num_theory                                          |✓    |   |✓   |          540|acc                                                                                                                                                                              |\n",
+        "|math_prealgebra                                          |✓    |   |✓   |          871|acc                                                                                                                                                                              |\n",
+        "|math_precalc                                             |✓    |   |✓   |          546|acc                                                                                                                                                                              |\n",
+        "|mathqa                                                   |✓    |✓  |✓   |         2985|acc, acc_norm                                                                                                                                                                    |\n",
+        "|mc_taco                                                  |     |✓  |✓   |         9442|f1, em                                                                                                                                                                           |\n",
+        "|mgsm_bn                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_de                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_en                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_es                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_fr                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_ja                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_ru                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_sw                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_te                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_th                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mgsm_zh                                                  |✓    |   |✓   |          250|acc                                                                                                                                                                              |\n",
+        "|mnli                                                     |✓    |✓  |    |         9815|acc                                                                                                                                                                              |\n",
+        "|mnli_mismatched                                          |✓    |✓  |    |         9832|acc                                                                                                                                                                              |\n",
+        "|mrpc                                                     |✓    |✓  |    |          408|acc, f1                                                                                                                                                                          |\n",
+        "|multirc                                                  |✓    |✓  |    |         4848|acc                                                                                                                                                                              |\n",
+        "|mutual                                                   |✓    |✓  |    |          886|r@1, r@2, mrr                                                                                                                                                                    |\n",
+        "|mutual_plus                                              |✓    |✓  |    |          886|r@1, r@2, mrr                                                                                                                                                                    |\n",
+        "|openbookqa                                               |✓    |✓  |✓   |          500|acc, acc_norm                                                                                                                                                                    |\n",
+        "|pawsx_de                                                 |✓    |✓  |✓   |         2000|acc                                                                                                                                                                              |\n",
+        "|pawsx_en                                                 |✓    |✓  |✓   |         2000|acc                                                                                                                                                                              |\n",
+        "|pawsx_es                                                 |✓    |✓  |✓   |         2000|acc                                                                                                                                                                              |\n",
+        "|pawsx_fr                                                 |✓    |✓  |✓   |         2000|acc                                                                                                                                                                              |\n",
+        "|pawsx_ja                                                 |✓    |✓  |✓   |         2000|acc                                                                                                                                                                              |\n",
+        "|pawsx_ko                                                 |✓    |✓  |✓   |         2000|acc                                                                                                                                                                              |\n",
+        "|pawsx_zh                                                 |✓    |✓  |✓   |         2000|acc                                                                                                                                                                              |\n",
+        "|pile_arxiv                                               |     |✓  |✓   |         2407|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_bookcorpus2                                         |     |✓  |✓   |           28|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_books3                                              |     |✓  |✓   |          269|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_dm-mathematics                                      |     |✓  |✓   |         1922|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_enron                                               |     |✓  |✓   |         1010|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_europarl                                            |     |✓  |✓   |          157|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_freelaw                                             |     |✓  |✓   |         5101|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_github                                              |     |✓  |✓   |        18195|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_gutenberg                                           |     |✓  |✓   |           80|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_hackernews                                          |     |✓  |✓   |         1632|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_nih-exporter                                        |     |✓  |✓   |         1884|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_opensubtitles                                       |     |✓  |✓   |          642|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_openwebtext2                                        |     |✓  |✓   |        32925|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_philpapers                                          |     |✓  |✓   |           68|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_pile-cc                                             |     |✓  |✓   |        52790|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_pubmed-abstracts                                    |     |✓  |✓   |        29895|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_pubmed-central                                      |     |✓  |✓   |         5911|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_stackexchange                                       |     |✓  |✓   |        30378|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_ubuntu-irc                                          |     |✓  |✓   |           22|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_uspto                                               |     |✓  |✓   |        11415|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_wikipedia                                           |     |✓  |✓   |        17511|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|pile_youtubesubtitles                                    |     |✓  |✓   |          342|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|piqa                                                     |✓    |✓  |    |         1838|acc, acc_norm                                                                                                                                                                    |\n",
+        "|prost                                                    |     |   |✓   |        18736|acc, acc_norm                                                                                                                                                                    |\n",
+        "|pubmedqa                                                 |     |   |✓   |         1000|acc                                                                                                                                                                              |\n",
+        "|qa4mre_2011                                              |     |   |✓   |          120|acc, acc_norm                                                                                                                                                                    |\n",
+        "|qa4mre_2012                                              |     |   |✓   |          160|acc, acc_norm                                                                                                                                                                    |\n",
+        "|qa4mre_2013                                              |     |   |✓   |          284|acc, acc_norm                                                                                                                                                                    |\n",
+        "|qasper                                                   |✓    |✓  |    |         1764|f1_yesno, f1_abstractive                                                                                                                                                         |\n",
+        "|qnli                                                     |✓    |✓  |    |         5463|acc                                                                                                                                                                              |\n",
+        "|qqp                                                      |✓    |✓  |    |        40430|acc, f1                                                                                                                                                                          |\n",
+        "|race                                                     |✓    |✓  |✓   |         1045|acc                                                                                                                                                                              |\n",
+        "|random_insertion                                         |     |✓  |    |        10000|acc                                                                                                                                                                              |\n",
+        "|record                                                   |✓    |✓  |    |        10000|f1, em                                                                                                                                                                           |\n",
+        "|reversed_words                                           |     |✓  |    |        10000|acc                                                                                                                                                                              |\n",
+        "|rte                                                      |✓    |✓  |    |          277|acc                                                                                                                                                                              |\n",
+        "|sciq                                                     |✓    |✓  |✓   |         1000|acc, acc_norm                                                                                                                                                                    |\n",
+        "|scrolls_contractnli                                      |✓    |✓  |    |         1037|em, acc, acc_norm                                                                                                                                                                |\n",
+        "|scrolls_govreport                                        |✓    |✓  |    |          972|rouge1, rouge2, rougeL                                                                                                                                                           |\n",
+        "|scrolls_narrativeqa                                      |✓    |✓  |    |         3425|f1                                                                                                                                                                               |\n",
+        "|scrolls_qasper                                           |✓    |✓  |    |          984|f1                                                                                                                                                                               |\n",
+        "|scrolls_qmsum                                            |✓    |✓  |    |          272|rouge1, rouge2, rougeL                                                                                                                                                           |\n",
+        "|scrolls_quality                                          |✓    |✓  |    |         2086|em, acc, acc_norm                                                                                                                                                                |\n",
+        "|scrolls_summscreenfd                                     |✓    |✓  |    |          338|rouge1, rouge2, rougeL                                                                                                                                                           |\n",
+        "|squad2                                                   |✓    |✓  |    |        11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1                                                                                                   |\n",
+        "|sst                                                      |✓    |✓  |    |          872|acc                                                                                                                                                                              |\n",
+        "|swag                                                     |✓    |✓  |    |        20006|acc, acc_norm                                                                                                                                                                    |\n",
+        "|toxigen                                                  |✓    |   |✓   |          940|acc, acc_norm                                                                                                                                                                    |\n",
+        "|triviaqa                                                 |✓    |✓  |    |        11313|acc                                                                                                                                                                              |\n",
+        "|truthfulqa_gen                                           |     |✓  |    |          817|bleurt_max, bleurt_acc, bleurt_diff, bleu_max, bleu_acc, bleu_diff, rouge1_max, rouge1_acc, rouge1_diff, rouge2_max, rouge2_acc, rouge2_diff, rougeL_max, rougeL_acc, rougeL_diff|\n",
+        "|truthfulqa_mc                                            |     |✓  |    |          817|mc1, mc2                                                                                                                                                                         |\n",
+        "|webqs                                                    |✓    |   |✓   |         2032|acc                                                                                                                                                                              |\n",
+        "|wic                                                      |✓    |✓  |    |          638|acc                                                                                                                                                                              |\n",
+        "|wikitext                                                 |✓    |✓  |✓   |           62|word_perplexity, byte_perplexity, bits_per_byte                                                                                                                                  |\n",
+        "|winogrande                                               |✓    |✓  |    |         1267|acc                                                                                                                                                                              |\n",
+        "|wmt14-en-fr                                              |     |   |✓   |         3003|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt14-fr-en                                              |     |   |✓   |         3003|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt16-de-en                                              |     |   |✓   |         2999|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt16-en-de                                              |     |   |✓   |         2999|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt16-en-ro                                              |     |   |✓   |         1999|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt16-ro-en                                              |     |   |✓   |         1999|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-cs-en                                              |     |   |✓   |          664|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-de-en                                              |     |   |✓   |          785|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-de-fr                                              |     |   |✓   |         1619|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-cs                                              |     |   |✓   |         1418|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-de                                              |     |   |✓   |         1418|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-iu                                              |     |   |✓   |         2971|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-ja                                              |     |   |✓   |         1000|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-km                                              |     |   |✓   |         2320|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-pl                                              |     |   |✓   |         1000|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-ps                                              |     |   |✓   |         2719|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-ru                                              |     |   |✓   |         2002|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-ta                                              |     |   |✓   |         1000|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-en-zh                                              |     |   |✓   |         1418|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-fr-de                                              |     |   |✓   |         1619|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-iu-en                                              |     |   |✓   |         2971|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-ja-en                                              |     |   |✓   |          993|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-km-en                                              |     |   |✓   |         2320|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-pl-en                                              |     |   |✓   |         1001|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-ps-en                                              |     |   |✓   |         2719|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-ru-en                                              |     |   |✓   |          991|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-ta-en                                              |     |   |✓   |          997|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wmt20-zh-en                                              |     |   |✓   |         2000|bleu, chrf, ter                                                                                                                                                                  |\n",
+        "|wnli                                                     |✓    |✓  |    |           71|acc                                                                                                                                                                              |\n",
+        "|wsc                                                      |✓    |✓  |    |          104|acc                                                                                                                                                                              |\n",
+        "|wsc273                                                   |     |   |✓   |          273|acc                                                                                                                                                                              |\n",
+        "|xcopa_et                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_ht                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_id                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_it                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_qu                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_sw                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_ta                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_th                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_tr                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_vi                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xcopa_zh                                                 |     |✓  |✓   |          500|acc                                                                                                                                                                              |\n",
+        "|xnli_ar                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_bg                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_de                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_el                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_en                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_es                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_fr                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_hi                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_ru                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_sw                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_th                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_tr                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_ur                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_vi                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xnli_zh                                                  |✓    |✓  |✓   |         5010|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_ar                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_en                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_es                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_eu                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_hi                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_id                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_my                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_ru                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_sw                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_te                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xstory_cloze_zh                                          |✓    |✓  |    |         1511|acc                                                                                                                                                                              |\n",
+        "|xwinograd_en                                             |     |   |✓   |         2325|acc                                                                                                                                                                              |\n",
+        "|xwinograd_fr                                             |     |   |✓   |           83|acc                                                                                                                                                                              |\n",
+        "|xwinograd_jp                                             |     |   |✓   |          959|acc                                                                                                                                                                              |\n",
+        "|xwinograd_pt                                             |     |   |✓   |          263|acc                                                                                                                                                                              |\n",
+        "|xwinograd_ru                                             |     |   |✓   |          315|acc                                                                                                                                                                              |\n",
+        "|xwinograd_zh                                             |     |   |✓   |          504|acc                                                                                                                                                                              |\n",
+        "| Ceval-valid-computer_network                         |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-operating_system                         |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-computer_architecture                    |   | ✓ |   | 21 | acc |\n",
+        "| Ceval-valid-college_programming                      |   | ✓ |   | 37 | acc |\n",
+        "| Ceval-valid-college_physics                          |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-college_chemistry                        |   | ✓ |   | 24 | acc |\n",
+        "| Ceval-valid-advanced_mathematics                     |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-probability_and_statistics               |   | ✓ |   | 18 | acc |\n",
+        "| Ceval-valid-discrete_mathematics                     |   | ✓ |   | 16 | acc |\n",
+        "| Ceval-valid-electrical_engineer                      |   | ✓ |   | 37 | acc |\n",
+        "| Ceval-valid-metrology_engineer                       |   | ✓ |   | 24 | acc |\n",
+        "| Ceval-valid-high_school_mathematics                  |   | ✓ |   | 18 | acc |\n",
+        "| Ceval-valid-high_school_physics                      |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-high_school_chemistry                    |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-high_school_biology                      |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-middle_school_mathematics                |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-middle_school_biology                    |   | ✓ |   | 21 | acc |\n",
+        "| Ceval-valid-middle_school_physics                    |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-middle_school_chemistry                  |   | ✓ |   | 20 | acc |\n",
+        "| Ceval-valid-veterinary_medicine                      |   | ✓ |   | 23 | acc |\n",
+        "| Ceval-valid-college_economics                        |   | ✓ |   | 55 | acc |\n",
+        "| Ceval-valid-business_administration                  |   | ✓ |   | 33 | acc |\n",
+        "| Ceval-valid-marxism                                  |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-mao_zedong_thought                       |   | ✓ |   | 24 | acc |\n",
+        "| Ceval-valid-education_science                        |   | ✓ |   | 29 | acc |\n",
+        "| Ceval-valid-teacher_qualification                    |   | ✓ |   | 44 | acc |\n",
+        "| Ceval-valid-high_school_politics                     |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-high_school_geography                    |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-middle_school_politics                   |   | ✓ |   | 21 | acc |\n",
+        "| Ceval-valid-middle_school_geography                  |   | ✓ |   | 12 | acc |\n",
+        "| Ceval-valid-modern_chinese_history                   |   | ✓ |   | 23 | acc |\n",
+        "| Ceval-valid-ideological_and_moral_cultivation        |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-logic                                    |   | ✓ |   | 22 | acc |\n",
+        "| Ceval-valid-law                                      |   | ✓ |   | 24 | acc |\n",
+        "| Ceval-valid-chinese_language_and_literature          |   | ✓ |   | 23 | acc |\n",
+        "| Ceval-valid-art_studies                              |   | ✓ |   | 33 | acc |\n",
+        "| Ceval-valid-professional_tour_guide                  |   | ✓ |   | 29 | acc |\n",
+        "| Ceval-valid-legal_professional                       |   | ✓ |   | 23 | acc |\n",
+        "| Ceval-valid-high_school_chinese                      |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-high_school_history                      |   | ✓ |   | 20 | acc |\n",
+        "| Ceval-valid-middle_school_history                    |   | ✓ |   | 22 | acc |\n",
+        "| Ceval-valid-civil_servant                            |   | ✓ |   | 47 | acc |\n",
+        "| Ceval-valid-sports_science                           |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-plant_protection                         |   | ✓ |   | 22 | acc |\n",
+        "| Ceval-valid-basic_medicine                           |   | ✓ |   | 19 | acc |\n",
+        "| Ceval-valid-clinical_medicine                        |   | ✓ |   | 22 | acc |\n",
+        "| Ceval-valid-urban_and_rural_planner                  |   | ✓ |   | 46 | acc |\n",
+        "| Ceval-valid-accountant                               |   | ✓ |   | 49 | acc |\n",
+        "| Ceval-valid-fire_engineer                            |   | ✓ |   | 31 | acc |\n",
+        "| Ceval-valid-environmental_impact_assessment_engineer |   | ✓ |   | 31 | acc |\n",
+        "| Ceval-valid-tax_accountant                           |   | ✓ |   | 49 | acc |\n",
+        "| Ceval-valid-physician                                |   | ✓ |   | 49 | acc |"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}