XS-dev committed on
Commit
5657307
1 Parent(s): b03ed9a
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +45 -0
  2. bert-base-uncased/config.json +23 -0
  3. bert-base-uncased/pytorch_model.bin +3 -0
  4. bert-base-uncased/tokenizer.json +0 -0
  5. bert-base-uncased/tokenizer_config.json +1 -0
  6. bert-base-uncased/vocab.txt +0 -0
  7. data/test.csv +0 -0
  8. data/train.csv +0 -0
  9. data/val.csv +0 -0
  10. evaluate/.github/hub/push_evaluations_to_hub.py +118 -0
  11. evaluate/.github/hub/requirements.txt +1 -0
  12. evaluate/.github/workflows/build_documentation.yml +15 -0
  13. evaluate/.github/workflows/build_pr_documentation.yml +16 -0
  14. evaluate/.github/workflows/ci.yml +64 -0
  15. evaluate/.github/workflows/delete_doc_comment.yml +13 -0
  16. evaluate/.github/workflows/python-release.yml +31 -0
  17. evaluate/.github/workflows/update_spaces.yml +36 -0
  18. evaluate/.gitignore +64 -0
  19. evaluate/AUTHORS +8 -0
  20. evaluate/CODE_OF_CONDUCT.md +132 -0
  21. evaluate/CONTRIBUTING.md +277 -0
  22. evaluate/LICENSE +202 -0
  23. evaluate/Makefile +19 -0
  24. evaluate/README.md +78 -0
  25. evaluate/additional-tests-requirements.txt +6 -0
  26. evaluate/comparisons/exact_match/README.md +61 -0
  27. evaluate/comparisons/exact_match/app.py +6 -0
  28. evaluate/comparisons/exact_match/exact_match.py +65 -0
  29. evaluate/comparisons/exact_match/requirements.txt +2 -0
  30. evaluate/comparisons/mcnemar/README.md +86 -0
  31. evaluate/comparisons/mcnemar/app.py +6 -0
  32. evaluate/comparisons/mcnemar/mcnemar.py +98 -0
  33. evaluate/comparisons/mcnemar/requirements.txt +2 -0
  34. evaluate/comparisons/wilcoxon/README.md +70 -0
  35. evaluate/comparisons/wilcoxon/app.py +6 -0
  36. evaluate/comparisons/wilcoxon/requirements.txt +3 -0
  37. evaluate/comparisons/wilcoxon/wilcoxon.py +78 -0
  38. evaluate/docs/README.md +285 -0
  39. evaluate/docs/source/_toctree.yml +52 -0
  40. evaluate/docs/source/a_quick_tour.mdx +380 -0
  41. evaluate/docs/source/base_evaluator.mdx +294 -0
  42. evaluate/docs/source/choosing_a_metric.mdx +64 -0
  43. evaluate/docs/source/considerations.mdx +88 -0
  44. evaluate/docs/source/creating_and_sharing.mdx +113 -0
  45. evaluate/docs/source/custom_evaluator.mdx +114 -0
  46. evaluate/docs/source/evaluation_suite.mdx +74 -0
  47. evaluate/docs/source/index.mdx +34 -0
  48. evaluate/docs/source/installation.mdx +68 -0
  49. evaluate/docs/source/keras_integrations.md +113 -0
  50. evaluate/docs/source/package_reference/evaluator_classes.mdx +63 -0
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import gradio as gr
3
+ import requests
4
+ import torch
5
+ from transformers import (
6
+ AutoConfig,
7
+ AutoModelForSequenceClassification,
8
+ AutoTokenizer,
9
+ )
10
+
11
+ model_dir = "my-bert-model"
12
+
13
+ config = AutoConfig.from_pretrained(model_dir, num_labels=3, finetuning_task="text-classification")
14
+ tokenizer = AutoTokenizer.from_pretrained(model_dir)
15
+ model = AutoModelForSequenceClassification.from_pretrained(model_dir, config=config)
16
+
17
def inference(input_text):
    """Classify ``input_text`` and return the predicted label name.

    The text is tokenized with the module-level ``tokenizer`` (truncated and
    padded to 512 tokens), run through the module-level ``model`` with
    gradient tracking disabled, and the highest-scoring class id is mapped to
    its label through ``model.config.id2label``.

    Args:
        input_text: The raw text to classify.

    Returns:
        The entry of ``model.config.id2label`` for the predicted class.
    """
    # ``padding="max_length"`` already requests fixed-length padding; the
    # deprecated ``pad_to_max_length`` flag duplicated it and is dropped.
    inputs = tokenizer(
        [input_text],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(**inputs).logits

    # The batch holds a single example, so the arg-max over the class
    # dimension yields exactly one id.
    predicted_class_id = logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class_id]
33
+
34
# Each example row must provide exactly one value per input component. The
# interface has a single text input, so the rows carry only the text (the
# trailing integers 1 and 0 that used to follow the texts had no matching
# input component).
demo = gr.Interface(
    fn=inference,
    inputs=gr.Textbox(label="Input Text", scale=2, container=False),
    outputs=gr.Textbox(label="Output Label"),
    examples=[
        ["My last two weather pics from the storm on August 2nd. People packed up real fast after the temp dropped and winds picked up."],
        ["Lying Clinton sinking! Donald Trump singing: Let's Make America Great Again!"],
    ],
    title="Tutorial: BERT-based Text Classification",
)
44
+
45
+ demo.launch(debug=True)
bert-base-uncased/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "gradient_checkpointing": false,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "transformers_version": "4.6.0.dev0",
20
+ "type_vocab_size": 2,
21
+ "use_cache": true,
22
+ "vocab_size": 30522
23
+ }
bert-base-uncased/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:097417381d6c7230bd9e3557456d726de6e83245ec8b24f529f60198a67b203a
3
+ size 440473133
bert-base-uncased/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bert-base-uncased/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "model_max_length": 512}
bert-base-uncased/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/train.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/val.csv ADDED
The diff for this file is too large to render. See raw diff
 
evaluate/.github/hub/push_evaluations_to_hub.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from huggingface_hub import create_repo, Repository
3
+ import tempfile
4
+ import subprocess
5
+ import os
6
+ import shutil
7
+ import logging
8
+ import re
9
+ from urllib.parse import urlparse
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ GIT_UP_TO_DATE = "On branch main\nYour branch is up to date with 'origin/main'.\
14
+ \n\nnothing to commit, working tree clean\n"
15
+
16
+ COMMIT_PLACEHOLDER = "{COMMIT_PLACEHOLDER}"
17
+
18
def get_git_tag(lib_path, commit_hash):
    """Return the version tag that points exactly at ``commit_hash``, if any.

    Runs ``git describe --exact-match <commit_hash>`` inside ``lib_path`` and
    inspects its standard output.

    Args:
        lib_path: Directory of the git checkout to query.
        commit_hash: Commit to look up.

    Returns:
        The tag name when it starts with ``v<digits>.<digits>.<digits>``,
        otherwise ``None`` (including when git reports no exact tag).
    """
    # check if commit has a tag, see: https://stackoverflow.com/questions/1474115/how-to-find-the-tag-associated-with-a-given-git-commit
    # The argument vector is built explicitly rather than by splitting an
    # interpolated string, so the hash is always passed as a single argument.
    completed = subprocess.run(
        ["git", "describe", "--exact-match", str(commit_hash)],
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        cwd=lib_path,
        env=os.environ.copy(),
    )
    tag = completed.stdout.strip()
    # Require at least one digit per component; ``\d*`` would also accept
    # strings such as "v..".
    if re.match(r"v\d+\.\d+\.\d+", tag) is not None:
        return tag
    return None
33
+
34
+
35
def copy_recursive(source_base_path, target_base_path):
    """Copy directory recursively and overwrite existing files.

    Args:
        source_base_path: Directory whose contents are copied (``Path`` or
            ``str``).
        target_base_path: Destination directory (``Path`` or ``str``). It is
            created, together with any missing parents, if it does not exist.
    """
    source_base_path = Path(source_base_path)
    target_base_path = Path(target_base_path)
    # Creating the destination here covers both the top-level call and every
    # recursive call for a sub-directory.
    target_base_path.mkdir(parents=True, exist_ok=True)
    for item in source_base_path.iterdir():
        target_path = target_base_path / item.name
        if item.is_dir():
            copy_recursive(item, target_path)
        else:
            shutil.copy(item, target_path)
44
+
45
def update_evaluate_dependency(requirements_path, commit_hash):
    """Updates the evaluate requirement with the latest commit.

    Every occurrence of ``COMMIT_PLACEHOLDER`` in the file at
    ``requirements_path`` is replaced by ``commit_hash`` and the file is
    rewritten in place.

    Args:
        requirements_path: Path of the requirements file to rewrite.
        commit_hash: Commit hash substituted for the placeholder.
    """
    # An explicit encoding keeps the rewrite independent of the platform's
    # default locale encoding.
    with open(requirements_path, "r", encoding="utf-8") as f:
        file_content = f.read()
    file_content = file_content.replace(COMMIT_PLACEHOLDER, commit_hash)
    with open(requirements_path, "w", encoding="utf-8") as f:
        f.write(file_content)
52
+
53
def push_module_to_hub(module_path, type, token, commit_hash, tag=None):
    """Push one evaluation module to its Space on the Hub.

    The Space ``evaluate-<type>/<module name>`` is created if needed and
    cloned into a temporary directory. The module files are copied over it,
    the commit placeholder in ``requirements.txt`` is replaced by
    ``commit_hash``, and the result is committed and pushed. When ``tag`` is
    given, the tag is also added on the remote.

    Args:
        module_path: ``Path`` of the local module directory.
        type: Module kind used to build the organisation name.
        token: Hub access token used for repo creation, clone and push.
        commit_hash: Hash of the commit being published.
        tag: Optional tag to add to the Space repository.

    Raises:
        OSError: If the repository cannot be cloned, or if committing or
            pushing fails for a reason other than "nothing to commit".
    """
    module_name = module_path.stem
    org = f"evaluate-{type}"

    repo_url = create_repo(org + "/" + module_name, repo_type="space", space_sdk="gradio", exist_ok=True, token=token)
    repo_path = Path(tempfile.mkdtemp())

    # The temporary clone is removed even when a later step fails.
    try:
        scheme = urlparse(repo_url).scheme
        repo_url = repo_url.replace(f"{scheme}://", f"{scheme}://user:{token}@")
        clean_repo_url = re.sub(r"(https?)://.*@", r"\1://", repo_url)

        try:
            subprocess.run(
                ["git", "clone", repo_url],
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,
                check=True,
                encoding="utf-8",
                cwd=repo_path,
                env=os.environ.copy(),
            )
        except (OSError, subprocess.CalledProcessError):
            # make sure we don't accidentally expose token: with check=True a
            # failed clone raises CalledProcessError, whose message contains
            # the full command line (including the token), so the original
            # exception is deliberately not chained.
            raise OSError(f"Could not clone from '{clean_repo_url}'") from None

        repo = Repository(local_dir=repo_path / module_name, use_auth_token=token)

        copy_recursive(module_path, repo_path / module_name)
        update_evaluate_dependency(repo_path / module_name / "requirements.txt", commit_hash)

        repo.git_add()
        try:
            repo.git_commit(f"Update Space (evaluate main: {commit_hash[:8]})")
            repo.git_push()
            logger.info(f"Module '{module_name}' pushed to the hub")
        except OSError as error:
            if str(error) == GIT_UP_TO_DATE:
                logger.info(f"Module '{module_name}' is already up to date.")
            else:
                raise

        if tag is not None:
            repo.add_tag(tag, message="add evaluate tag", remote="origin")
    finally:
        shutil.rmtree(repo_path)
98
+
99
+
100
if __name__ == "__main__":
    # Directory names in the library checkout and the matching module kinds
    # (the kind is used to build the Hub organisation name).
    evaluation_paths = ["metrics", "comparisons", "measurements"]
    evaluation_types = ["metric", "comparison", "measurement"]

    token = os.getenv("HF_TOKEN")
    lib_path_env = os.getenv("EVALUATE_LIB_PATH")
    commit_hash = os.getenv("GIT_HASH")
    # Fail early with a clear message instead of a TypeError further down.
    if lib_path_env is None or commit_hash is None:
        raise EnvironmentError("EVALUATE_LIB_PATH and GIT_HASH must be set in the environment.")
    evaluate_lib_path = Path(lib_path_env)

    git_tag = get_git_tag(evaluate_lib_path, commit_hash)
    if git_tag is not None:
        logger.info(f"Found tag: {git_tag}.")

    # Loop names avoid shadowing the builtins ``type`` and ``dir``.
    for module_type, module_dir in zip(evaluation_types, evaluation_paths):
        modules_root = evaluate_lib_path / module_dir
        if modules_root.exists():
            for module_path in modules_root.iterdir():
                if module_path.is_dir():
                    logger.info(f"Updating: module {module_path.name}.")
                    push_module_to_hub(module_path, module_type, token, commit_hash, tag=git_tag)
        else:
            logger.warning(f"No folder {str(modules_root)} for {module_type} found.")
evaluate/.github/hub/requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ huggingface_hub
evaluate/.github/workflows/build_documentation.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build documentation
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ - doc-builder*
8
+ - v*-release
9
+
10
+ jobs:
11
+ build:
12
+ uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
13
+ with:
14
+ commit_sha: ${{ github.sha }}
15
+ package: evaluate
evaluate/.github/workflows/build_pr_documentation.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build PR Documentation
2
+
3
+ on:
4
+ pull_request:
5
+
6
+ concurrency:
7
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
8
+ cancel-in-progress: true
9
+
10
+ jobs:
11
+ build:
12
+ uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13
+ with:
14
+ commit_sha: ${{ github.event.pull_request.head.sha }}
15
+ pr_number: ${{ github.event.number }}
16
+ package: evaluate
evaluate/.github/workflows/ci.yml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - main
7
+ push:
8
+ branches:
9
+ - main
10
+ - ci-*
11
+
12
+ env:
13
+ HF_ALLOW_CODE_EVAL: 1
14
+
15
+ jobs:
16
+
17
+ check_code_quality:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v3
21
+ - name: Set up Python
22
+ uses: actions/setup-python@v4
23
+ with:
24
+ python-version: "3.7"
25
+ - name: Install dependencies
26
+ run: |
27
+ python -m pip install --upgrade pip
28
+ pip install .[quality]
29
+ - name: Check quality
30
+ run: |
31
+ black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
32
+ isort --check-only tests src metrics comparisons measurements
33
+ flake8 tests src metrics
34
+
35
+ test:
36
+ needs: check_code_quality
37
+ strategy:
38
+ fail-fast: false
39
+ matrix:
40
+ test: ['unit', 'parity']
41
+ os: [ubuntu-latest, windows-latest]
42
+ runs-on: ${{ matrix.os }}
43
+ steps:
44
+ - uses: actions/checkout@v3
45
+ with:
46
+ fetch-depth: 0
47
+ - name: Set up Python 3.7
48
+ uses: actions/setup-python@v4
49
+ with:
50
+ python-version: "3.7"
51
+ - name: Upgrade pip
52
+ run: python -m pip install --upgrade pip
53
+ - name: Install dependencies
54
+ run: |
55
+ pip install .[tests]
56
+ pip install -r additional-tests-requirements.txt --no-deps
57
+ - name: Test with pytest
58
+ if: ${{ matrix.test == 'unit' }}
59
+ run: |
60
+ python -m pytest -n 2 --dist loadfile -sv ./tests/ --ignore=./tests/test_trainer_evaluator_parity.py
61
+ - name: Integration test with transformers
62
+ if: ${{ matrix.test == 'parity' }}
63
+ run: |
64
+ python -m pytest -n 2 --dist loadfile -sv ./tests/test_trainer_evaluator_parity.py
evaluate/.github/workflows/delete_doc_comment.yml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Delete dev documentation
2
+
3
+ on:
4
+ pull_request:
5
+ types: [ closed ]
6
+
7
+
8
+ jobs:
9
+ delete:
10
+ uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
11
+ with:
12
+ pr_number: ${{ github.event.number }}
13
+ package: evaluate
evaluate/.github/workflows/python-release.yml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Python release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - v*
7
+
8
+ env:
9
+ PYPI_TOKEN: ${{ secrets.PYPI_TOKEN_DIST }}
10
+
11
+ jobs:
12
+ python_release:
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - uses: actions/checkout@v2
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v2
19
+ with:
20
+ python-version: 3.9
21
+ - name: Install dependencies
22
+ run: |
23
+ pip install --upgrade pip
24
+ pip install setuptools wheel
25
+ - run: python setup.py sdist bdist_wheel
26
+
27
+ - run: |
28
+ pip install twine
29
+ - name: Upload to PyPi
30
+ run: |
31
+ twine upload dist/* -u __token__ -p "$PYPI_TOKEN"
evaluate/.github/workflows/update_spaces.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Update Hub repositories
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ update-hub-repositories:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Checkout repository
13
+ uses: actions/checkout@v2
14
+ with:
15
+ fetch-depth: 0
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v2
18
+ with:
19
+ python-version: "3.7"
20
+ - name: Set up default Git config
21
+ run: |
22
+ git config --global user.name evaluate-bot
23
+ git config --global user.email [email protected]
24
+ - name: Install dependencies
25
+ working-directory: ./.github/hub
26
+ run: |
27
+ python -m pip install --upgrade pip
28
+ pip install -r requirements.txt
29
+ - name: Update Hub repositories
30
+ working-directory: ./.github/hub
31
+ run: |
32
+ export HF_TOKEN=${{ secrets.HF_HUB_TOKEN }}
33
+ export EVALUATE_LIB_PATH=$GITHUB_WORKSPACE
34
+ export GIT_HASH=$GITHUB_SHA
35
+ export GIT_LFS_SKIP_SMUDGE=1
36
+ python push_evaluations_to_hub.py
evaluate/.gitignore ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Locked files
2
+ *.lock
3
+ !dvc.lock
4
+
5
+ # Extracted dummy data
6
+ datasets/**/dummy_data-zip-extracted/
7
+
8
+ # Compiled python modules.
9
+ *.pyc
10
+
11
+ # Byte-compiled
12
+ __pycache__/
13
+ .cache/
14
+
15
+ # Python egg metadata, regenerated from source files by setuptools.
16
+ *.egg-info
17
+ .eggs/
18
+
19
+ # PyPI distribution artifacts.
20
+ build/
21
+ dist/
22
+
23
+ # Environments
24
+ .env
25
+ .venv
26
+ env/
27
+ venv/
28
+ ENV/
29
+ env.bak/
30
+ venv.bak/
31
+
32
+ # pyenv
33
+ .python-version
34
+
35
+ # Tests
36
+ .pytest_cache/
37
+
38
+ # Other
39
+ *.DS_Store
40
+
41
+ # PyCharm/vscode
42
+ .idea
43
+ .vscode
44
+
45
+ # keep only the empty datasets and metrics directory with its __init__.py file
46
+ /src/*/datasets/*
47
+ !/src/*/datasets/__init__.py
48
+
49
+ /src/*/metrics/*
50
+ !/src/*/metrics/__init__.py
51
+
52
+ # Vim
53
+ .*.swp
54
+
55
+ # playground
56
+ /playground
57
+
58
+ # Sphinx documentation
59
+ docs/_build/
60
+ docs/source/_build/
61
+
62
+ # Benchmark results
63
+ report.json
64
+ report.md
evaluate/AUTHORS ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # This is the list of HuggingFace Datasets authors for copyright purposes.
2
+ #
3
+ # This does not necessarily list everyone who has contributed code, since in
4
+ # some cases, their employer may be the copyright holder. To see the full list
5
+ # of contributors, see the revision history in source control.
6
+
7
+ Google Inc.
8
+ HuggingFace Inc.
evaluate/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, caste, color, religion, or sexual identity
10
+ and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the
26
+ overall community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or
31
+ advances of any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email
35
+ address, without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official e-mail address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ feedback@huggingface.co.
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series
86
+ of actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or
93
+ permanent ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within
113
+ the community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.0, available at
119
+ [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
120
+
121
+ Community Impact Guidelines were inspired by
122
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123
+
124
+ For answers to common questions about this code of conduct, see the FAQ at
125
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available
126
+ at [https://www.contributor-covenant.org/translations][translations].
127
+
128
+ [homepage]: https://www.contributor-covenant.org
129
+ [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
130
+ [Mozilla CoC]: https://github.com/mozilla/diversity
131
+ [FAQ]: https://www.contributor-covenant.org/faq
132
+ [translations]: https://www.contributor-covenant.org/translations
evaluate/CONTRIBUTING.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # How to contribute to Evaluate
18
+
19
+ Everyone is welcome to contribute, and we value everybody's contribution. Code
20
+ is not the only way to help the community. Answering questions, helping
21
+ others, reaching out and improving the documentation are immensely valuable to
22
+ the community.
23
+
24
+ It also helps us if you spread the word: reference the library from blog posts
25
+ on the awesome projects it made possible, shout out on Twitter every time it has
26
+ helped you, or simply star the repo to say "thank you".
27
+
28
+ Whichever way you choose to contribute, please be mindful to respect our
29
+ [code of conduct](https://github.com/huggingface/evaluate/blob/main/CODE_OF_CONDUCT.md).
30
+
31
+ ## You can contribute in so many ways!
32
+
33
+ There are four ways you can contribute to `evaluate`:
34
+ * Fixing outstanding issues with the existing code;
35
+ * Implementing new evaluators and metrics;
36
+ * Contributing to the examples and documentation;
37
+ * Submitting issues related to bugs or desired new features.
38
+
39
+ Open issues are tracked directly on the repository [here](https://github.com/huggingface/evaluate/issues).
40
+
41
+ If you would like to work on any of the open issues:
42
+ * Make sure it is not already assigned to someone else. The assignee (if any) is on the top right column of the Issue page. If it's not taken, self-assign it.
43
+ * Work on your self-assigned issue and create a Pull Request!
44
+
45
+ ## Submitting a new issue or feature request
46
+
47
+ Following these guidelines when submitting an issue or a feature
48
+ request will make it easier for us to come back to you quickly and with good
49
+ feedback.
50
+
51
+ ### Do you want to implement a new metric?
52
+
53
+ All evaluation modules, be it metrics, comparisons, or measurements live on the 🤗 Hub in a [Space](https://huggingface.co/docs/hub/spaces) (see for example [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)). Evaluation modules can be either **community** or **canonical**.
54
+
55
+ * **Canonical** metrics are well-established metrics which are already broadly adopted.
56
+ * **Community** metrics are new or custom metrics. It is simple to add a new community metric to use with `evaluate`. Please see our guide to adding a new evaluation metric [here](https://huggingface.co/docs/evaluate/creating_and_sharing)!
57
+
58
+ The only functional difference is that canonical metrics are integrated into the `evaluate` library directly and do not require a namespace when being loaded.
59
+
60
+ We encourage contributors to share new evaluation modules they contribute broadly! If they become widely adopted then they will be integrated into the core `evaluate` library as a canonical module.
61
+
62
+ ### Do you want to request a new feature (that is not a metric)?
63
+
64
+ We would appreciate it if your feature request addresses the following points:
65
+
66
+ 1. Motivation first:
67
+ * Is it related to a problem/frustration with the library? If so, please explain
68
+ why. Providing a code snippet that demonstrates the problem is best.
69
+ * Is it related to something you would need for a project? We'd love to hear
70
+ about it!
71
+ * Is it something you worked on and think could benefit the community?
72
+ Awesome! Tell us what problem it solved for you.
73
+ 2. Write a *full paragraph* describing the feature;
74
+ 3. Provide a **code snippet** that demonstrates its future use;
75
+ 4. In case this is related to a paper, please attach a link;
76
+ 5. Attach any additional information (drawings, screenshots, etc.) you think may help.
77
+
78
+ ### Did you find a bug?
79
+
80
+ Thank you for reporting an issue. If the bug is related to a community metric, please open an issue or pull request directly on the repository of the metric on the Hugging Face Hub.
81
+
82
+ If the bug is related to the `evaluate` library and not a community metric, we would really appreciate it if you could **make sure the bug was not already reported** (use the search bar on GitHub under Issues). If it's not already logged, please open an issue with these details:
83
+
84
+ * Include your **OS type and version**, the versions of **Python**, **PyTorch** and
85
+ **Tensorflow** when applicable;
86
+ * A short, self-contained, code snippet that allows us to reproduce the bug in
87
+ less than 30s;
88
+ * Provide the *full* traceback if an exception is raised.
89
+
90
+ ## Start contributing! (Pull Requests)
91
+
92
+ Before writing code, we strongly advise you to search through the existing PRs or
93
+ issues to make sure that nobody is already working on the same thing. If you are
94
+ unsure, it is always a good idea to open an issue to get some feedback.
95
+
96
+ 1. Fork the [repository](https://github.com/huggingface/evaluate) by
97
+ clicking on the 'Fork' button on the repository's page. This creates a copy of the code
98
+ under your GitHub user account.
99
+
100
+ 2. Clone your fork to your local disk, and add the base repository as a remote:
101
+
102
+ ```bash
103
+ $ git clone git@github.com:<your Github handle>/evaluate.git
104
+ $ cd evaluate
105
+ $ git remote add upstream https://github.com/huggingface/evaluate.git
106
+ ```
107
+
108
+ 3. Create a new branch to hold your development changes:
109
+
110
+ ```bash
111
+ $ git checkout -b a-descriptive-name-for-my-changes
112
+ ```
113
+
114
+ **Do not** work on the `main` branch.
115
+
116
+ 4. Set up a development environment by running the following command in a virtual environment:
117
+
118
+ ```bash
119
+ $ pip install -e ".[dev]"
120
+ ```
121
+
122
+ 5. Develop the features on your branch.
123
+
124
+ As you work on the features, you should make sure that the test suite
125
+ passes. You should run the tests impacted by your changes like this:
126
+
127
+ ```bash
128
+ $ pytest tests/<TEST_TO_RUN>.py
129
+ ```
130
+
131
+ To run a specific test, for example the `test_model_init` test in test_evaluator.py,
132
+
133
+ ```bash
134
+ python -m pytest ./tests/test_evaluator.py::TestQuestionAnsweringEvaluator::test_model_init
135
+ ```
136
+
137
+ You can also run the full suite with the following command:
138
+
139
+ ```bash
140
+ $ python -m pytest ./tests/
141
+ ```
142
+
143
+ 🤗 Evaluate relies on `black` and `isort` to format its source code
144
+ consistently. After you make changes, apply automatic style corrections and code verifications
145
+ that can't be automated in one go with:
146
+
147
+ ```bash
148
+ $ make quality
149
+ ```
150
+
151
+ This target is also optimized to only work with files modified by the PR you're working on.
152
+
153
+ If you prefer to run the checks one after the other, the following command applies the
154
+ style corrections:
155
+
156
+ ```bash
157
+ $ make style
158
+ ```
159
+
160
+ 🤗 Evaluate also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
161
+ control runs in CI, however you can also run the same checks with:
162
+
163
+ ```bash
164
+ $ make quality
165
+ ```
166
+
167
+ If you're modifying documents under `docs/source`, make sure to validate that
168
+ they can still be built. This check also runs in CI. To run a local check
169
+ make sure you have installed the documentation builder requirements. First you will need to install the
170
+ package containing our tools to build the documentation:
171
+
172
+ ```bash
173
+ $ pip install git+https://github.com/huggingface/doc-builder
174
+ ```
175
+
176
+ Then, make sure you have all the dependencies to be able to build the doc with:
177
+
178
+ ```bash
179
+ $ pip install ".[docs]"
180
+ ```
181
+
182
+ Finally, run the following command from the root of the repository:
183
+
184
+ ```bash
185
+ $ doc-builder build evaluate docs/source/ --build_dir ~/tmp/test-build
186
+ ```
187
+
188
+ This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
189
+ Markdown files with your favorite editor. You won't be able to see the final rendering on the website
190
+ before your PR is merged; we are actively working on adding a tool for this.
191
+
192
+ Once you're happy with your changes, add changed files using `git add` and
193
+ make a commit with `git commit` to record your changes locally:
194
+
195
+ ```bash
196
+ $ git add modified_file.py
197
+ $ git commit
198
+ ```
199
+
200
+ Please write [good commit
201
+ messages](https://chris.beams.io/posts/git-commit/).
202
+
203
+ It is a good idea to sync your copy of the code with the original
204
+ repository regularly. This way you can quickly account for changes:
205
+
206
+ ```bash
207
+ $ git fetch upstream
208
+ $ git rebase upstream/main
209
+ ```
210
+
211
+ Push the changes to your account using:
212
+
213
+ ```bash
214
+ $ git push -u origin a-descriptive-name-for-my-changes
215
+ ```
216
+
217
+ 6. Once you are satisfied, go to the webpage of your fork on GitHub. Click on 'Pull request' to send your changes
218
+ to the project maintainers for review.
219
+
220
+ 7. It's ok if maintainers ask you for changes. It happens to core contributors
221
+ too! So everyone can see the changes in the Pull request, work in your local
222
+ branch and push the changes to your fork. They will automatically appear in
223
+ the pull request.
224
+
225
+
226
+ ### Checklist
227
+
228
+ 1. The title of your pull request should be a summary of its contribution;
229
+ 2. If your pull request addresses an issue, please mention the issue number in
230
+ the pull request description to make sure they are linked (and people
231
+ consulting the issue know you are working on it);
232
+ 3. To indicate a work in progress please prefix the title with `[WIP]`. These
233
+ are useful to avoid duplicated work, and to differentiate it from PRs ready
234
+ to be merged;
235
+ 4. Make sure existing tests pass;
236
+ 5. Add high-coverage tests. No quality testing = no merge.
237
+ 6. All public methods must have informative docstrings that work nicely with sphinx.
238
+ 7. Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
239
+ the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
240
+ them by URL.
241
+
242
+
243
+ ### Style guide
244
+
245
+ For documentation strings, 🤗 Evaluate follows the [google style](https://google.github.io/styleguide/pyguide.html).
246
+ Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
247
+ for more information.
248
+
249
+ **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
250
+
251
+ ### Develop on Windows
252
+
253
+ On Windows, you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
254
+
255
+ `git config core.autocrlf input`
256
+
257
+ One way to run the `make` command on Windows is to go through MSYS2:
258
+
259
+ 1. [Download MSYS2](https://www.msys2.org/); we assume it is installed in C:\msys64
260
+ 2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu)
261
+ 3. Run in the shell: `pacman -Syu` and install make with `pacman -S make`
262
+ 4. Add `C:\msys64\usr\bin` to your PATH environment variable.
263
+
264
+ You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉
265
+
266
+ ### Syncing forked main with upstream (HuggingFace) main
267
+
268
+ To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
269
+ when syncing the main branch of a forked repository, please, follow these steps:
270
+ 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead, merge directly into the forked main.
271
+ 2. If a PR is absolutely necessary, use the following steps after checking out your branch:
272
+ ```
273
+ $ git checkout -b your-branch-for-syncing
274
+ $ git pull --squash --no-commit upstream main
275
+ $ git commit -m '<your message without GitHub references>'
276
+ $ git push --set-upstream origin your-branch-for-syncing
277
+ ```
evaluate/LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
evaluate/Makefile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: quality style test
2
+
3
+ # Check that source code meets quality standards
4
+
5
+ quality:
6
+ black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
7
+ isort --check-only tests src metrics measurements
8
+ flake8 tests src metrics
9
+
10
+ # Format source code automatically
11
+
12
+ style:
13
+ black --line-length 119 --target-version py36 tests src metrics comparisons measurements
14
+ isort tests src metrics measurements
15
+
16
+ # Run tests for the library
17
+
18
+ test:
19
+ python -m pytest -n auto --dist=loadfile -s -v ./tests/
evaluate/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <br>
3
+ <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/evaluate-banner.png" width="400"/>
4
+ <br>
5
+ </p>
6
+
7
+ <p align="center">
8
+ <a href="https://github.com/huggingface/evaluate/actions/workflows/ci.yml?query=branch%3Amain">
9
+ <img alt="Build" src="https://github.com/huggingface/evaluate/actions/workflows/ci.yml/badge.svg?branch=main">
10
+ </a>
11
+ <a href="https://github.com/huggingface/evaluate/blob/main/LICENSE">
12
+ <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/evaluate.svg?color=blue">
13
+ </a>
14
+ <a href="https://huggingface.co/docs/evaluate/index">
15
+ <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/evaluate/index.svg?down_color=red&down_message=offline&up_message=online">
16
+ </a>
17
+ <a href="https://github.com/huggingface/evaluate/releases">
18
+ <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/evaluate.svg">
19
+ </a>
20
+ <a href="CODE_OF_CONDUCT.md">
21
+ <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg">
22
+ </a>
23
+ </p>
24
+
25
+ 🤗 Evaluate is a library that makes evaluating and comparing models and reporting their performance easier and more standardized.
26
+
27
+ It currently contains:
28
+
29
+ - **implementations of dozens of popular metrics**: the existing metrics cover a variety of tasks spanning from NLP to Computer Vision, and include dataset-specific metrics. With a simple command like `accuracy = load("accuracy")`, get any of these metrics ready to use for evaluating an ML model in any framework (Numpy/Pandas/PyTorch/TensorFlow/JAX).
30
+ - **comparisons and measurements**: comparisons are used to measure the difference between models and measurements are tools to evaluate datasets.
31
+ - **an easy way of adding new evaluation modules to the 🤗 Hub**: you can create new evaluation modules and push them to a dedicated Space in the 🤗 Hub with `evaluate-cli create [metric name]`, which allows you to easily compare different metrics and their outputs for the same sets of references and predictions.
32
+
33
+ [🎓 **Documentation**](https://huggingface.co/docs/evaluate/)
34
+
35
+ 🔎 **Find a [metric](https://huggingface.co/evaluate-metric), [comparison](https://huggingface.co/evaluate-comparison), [measurement](https://huggingface.co/evaluate-measurement) on the Hub**
36
+
37
+ [🌟 **Add a new evaluation module**](https://huggingface.co/docs/evaluate/)
38
+
39
+ 🤗 Evaluate also has lots of useful features like:
40
+
41
+ - **Type checking**: the input types are checked to make sure that you are using the right input formats for each metric
42
+ - **Metric cards**: each metric comes with a card that describes the values, limitations and their ranges, as well as providing examples of their usage and usefulness.
43
+ - **Community metrics:** Metrics live on the Hugging Face Hub and you can easily add your own metrics for your project or to collaborate with others.
44
+
45
+
46
+ # Installation
47
+
48
+ ## With pip
49
+
50
+ 🤗 Evaluate can be installed from PyPI and has to be installed in a virtual environment (venv or conda for instance):
51
+
52
+ ```bash
53
+ pip install evaluate
54
+ ```
55
+
56
+ # Usage
57
+
58
+ 🤗 Evaluate's main methods are:
59
+
60
+ - `evaluate.list_evaluation_modules()` to list the available metrics, comparisons and measurements
61
+ - `evaluate.load(module_name, **kwargs)` to instantiate an evaluation module
62
+ - `results = module.compute(**kwargs)` to compute the result of an evaluation module
63
+
64
+ # Adding a new evaluation module
65
+
66
+ First install the necessary dependencies to create a new metric with the following command:
67
+ ```bash
68
+ pip install evaluate[template]
69
+ ```
70
+ Then you can get started with the following command which will create a new folder for your metric and display the necessary steps:
71
+ ```bash
72
+ evaluate-cli create "Awesome Metric"
73
+ ```
74
+ See this [step-by-step guide](https://huggingface.co/docs/evaluate/creating_and_sharing) in the documentation for detailed instructions.
75
+
76
+ ## Credits
77
+
78
+ Thanks to [@marella](https://github.com/marella) for letting us use the `evaluate` namespace on PyPI previously used by his [library](https://github.com/marella/evaluate).
evaluate/additional-tests-requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ unbabel-comet>=1.0.0;python_version>'3.6'
2
+ git+https://github.com/google-research/bleurt.git
3
+ git+https://github.com/ns-moosavi/coval.git
4
+ git+https://github.com/hendrycks/math.git
5
+ git+https://github.com/google-research/rl-reliability-metrics
6
+ gin-config
evaluate/comparisons/exact_match/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Exact Match
3
+ emoji: 🤗
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.0.2
8
+ app_file: app.py
9
+ pinned: false
10
+ tags:
11
+ - evaluate
12
+ - comparison
13
+ description: >-
14
+ Returns the rate at which the predictions of one model exactly match those of another model.
15
+ ---
16
+
17
+
18
+ # Comparison Card for Exact Match
19
+
20
+ ## Comparison description
21
+
22
+ Given two model predictions the exact match score is 1 if they are the exact same, and is 0 otherwise. The overall exact match score is the average.
23
+
24
+ - **Example 1**: The exact match score is 1.0 if prediction 1 is [0, 1], given prediction 2 is [0, 1].
25
+ - **Example 2**: The exact match score is 0.0 if prediction 1 is [0, 1], given prediction 2 is [1, 0].
26
+ - **Example 3**: The exact match score is 0.5 if prediction 1 is [0, 1], given prediction 2 is [1, 1].
27
+
28
+ ## How to use
29
+
30
+ At minimum, this comparison takes as input two lists of predictions, `predictions1` and `predictions2`:
31
+ ```python
32
+ >>> exact_match = evaluate.load("exact_match", module_type="comparison")
33
+ >>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1])
34
+ >>> print(results)
35
+ {'exact_match': 0.66}
36
+ ```
37
+
38
+ ## Output values
39
+
40
+ Returns a float between 0.0 and 1.0 inclusive.
41
+
42
+ ## Examples
43
+
44
+ ```python
45
+ >>> exact_match = evaluate.load("exact_match", module_type="comparison")
46
+ >>> results = exact_match.compute(predictions1=[0, 0, 0], predictions2=[1, 1, 1])
47
+ >>> print(results)
48
+ {'exact_match': 0.0}
49
+ ```
50
+
51
+ ```python
52
+ >>> exact_match = evaluate.load("exact_match", module_type="comparison")
53
+ >>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1])
54
+ >>> print(results)
55
+ {'exact_match': 0.66}
56
+ ```
57
+
58
+
59
+ ## Limitations and bias
60
+
61
+ ## Citations
evaluate/comparisons/exact_match/app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+
4
+
5
+ module = evaluate.load("exact_match", module_type="comparison")
6
+ launch_gradio_widget(module)
evaluate/comparisons/exact_match/exact_match.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Evaluate Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Exact match test for model comparison."""
15
+
16
+ import datasets
17
+ import numpy as np
18
+
19
+ import evaluate
20
+
21
+
22
+ _DESCRIPTION = """
23
+ Returns the rate at which the predictions of one model exactly match those of another model.
24
+ """
25
+
26
+
27
+ _KWARGS_DESCRIPTION = """
28
+ Args:
29
+ predictions1 (`list` of `int`): Predicted labels for model 1.
30
+ predictions2 (`list` of `int`): Predicted labels for model 2.
31
+
32
+ Returns:
33
+ exact_match (`float`): Dictionary containing exact_match rate. Possible values are between 0.0 and 1.0, inclusive.
34
+
35
+ Examples:
36
+ >>> exact_match = evaluate.load("exact_match", module_type="comparison")
37
+ >>> results = exact_match.compute(predictions1=[1, 1, 1], predictions2=[1, 1, 1])
38
+ >>> print(results)
39
+ {'exact_match': 1.0}
40
+ """
41
+
42
+
43
+ _CITATION = """
44
+ """
45
+
46
+
47
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
48
+ class ExactMatch(evaluate.Comparison):
49
+ def _info(self):
50
+ return evaluate.ComparisonInfo(
51
+ module_type="comparison",
52
+ description=_DESCRIPTION,
53
+ citation=_CITATION,
54
+ inputs_description=_KWARGS_DESCRIPTION,
55
+ features=datasets.Features(
56
+ {
57
+ "predictions1": datasets.Value("int64"),
58
+ "predictions2": datasets.Value("int64"),
59
+ }
60
+ ),
61
+ )
62
+
63
+ def _compute(self, predictions1, predictions2):
64
+ score_list = [p1 == p2 for p1, p2 in zip(predictions1, predictions2)]
65
+ return {"exact_match": np.mean(score_list)}
evaluate/comparisons/exact_match/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2
+ scipy
evaluate/comparisons/mcnemar/README.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: McNemar
3
+ emoji: 🤗
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.0.2
8
+ app_file: app.py
9
+ pinned: false
10
+ tags:
11
+ - evaluate
12
+ - comparison
13
+ description: >-
14
+ McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with:
15
+ McNemar = (SE - SP)**2 / (SE + SP)
16
+ Where:
17
+ SE: Sensitivity (Test 1 positive; Test 2 negative)
18
+ SP: Specificity (Test 1 negative; Test 2 positive)
19
+ ---
20
+
21
+
22
+ # Comparison Card for McNemar
23
+
24
+ ## Comparison description
25
+
26
+ McNemar's test is a non-parametric diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with:
27
+
28
+ McNemar = (SE - SP)**2 / (SE + SP)
29
+
30
+ Where:
31
+ * SE: Sensitivity (Test 1 positive; Test 2 negative)
32
+ * SP: Specificity (Test 1 negative; Test 2 positive)
33
+
34
+ In other words, SE and SP are the off-diagonal elements of the contingency table for the classifier predictions (`predictions1` and `predictions2`) with respect to the ground truth `references`.
35
+
36
+ ## How to use
37
+
38
+ The McNemar comparison calculates the proportions of responses that exhibit disagreement between two classifiers. It is used to analyze paired nominal data.
39
+
40
+ ## Inputs
41
+
42
+ Its arguments are:
43
+
44
+ `predictions1`: a list of predictions from the first model.
45
+
46
+ `predictions2`: a list of predictions from the second model.
47
+
48
+ `references`: a list of the ground truth reference labels.
49
+
50
+ ## Output values
51
+
52
+ The McNemar comparison outputs two things:
53
+
54
+ `stat`: The McNemar statistic.
55
+
56
+ `p`: The p value.
57
+
58
+ ## Examples
59
+
60
+ Example comparison:
61
+
62
+ ```python
63
+ mcnemar = evaluate.load("mcnemar", module_type="comparison")
64
+ results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1])
65
+ print(results)
66
+ {'stat': 1.0, 'p': 0.31731050786291115}
67
+ ```
68
+
69
+ ## Limitations and bias
70
+
71
+ The McNemar test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired nominal data only.
72
+
73
+ ## Citations
74
+
75
+ ```bibtex
76
+ @article{mcnemar1947note,
77
+ title={Note on the sampling error of the difference between correlated proportions or percentages},
78
+ author={McNemar, Quinn},
79
+ journal={Psychometrika},
80
+ volume={12},
81
+ number={2},
82
+ pages={153--157},
83
+ year={1947},
84
+ publisher={Springer-Verlag}
85
+ }
86
+ ```
evaluate/comparisons/mcnemar/app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+
4
+
5
+ module = evaluate.load("mcnemar", module_type="comparison")
6
+ launch_gradio_widget(module)
evaluate/comparisons/mcnemar/mcnemar.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Evaluate Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """McNemar test for model comparison."""
15
+
16
+ import datasets
17
+ from scipy.stats import chi2
18
+
19
+ import evaluate
20
+
21
+
22
+ _DESCRIPTION = """
23
+ McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with:
24
+ McNemar = (SE - SP)**2 / SE + SP
25
+ Where:
26
+ SE: Sensitivity (Test 1 positive; Test 2 negative)
27
+ SP: Specificity (Test 1 negative; Test 2 positive)
28
+ """
29
+
30
+
31
+ _KWARGS_DESCRIPTION = """
32
+ Args:
33
+ predictions1 (`list` of `int`): Predicted labels for model 1.
34
+ predictions2 (`list` of `int`): Predicted labels for model 2.
35
+ references (`list` of `int`): Ground truth labels.
36
+
37
+ Returns:
38
+ stat (`float`): McNemar test score.
39
+ p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference.
40
+
41
+ Examples:
42
+ >>> mcnemar = evaluate.load("mcnemar")
43
+ >>> results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1])
44
+ >>> print(results)
45
+ {'stat': 1.0, 'p': 0.31731050786291115}
46
+ """
47
+
48
+
49
+ _CITATION = """
50
+ @article{mcnemar1947note,
51
+ title={Note on the sampling error of the difference between correlated proportions or percentages},
52
+ author={McNemar, Quinn},
53
+ journal={Psychometrika},
54
+ volume={12},
55
+ number={2},
56
+ pages={153--157},
57
+ year={1947},
58
+ publisher={Springer-Verlag}
59
+ }
60
+ """
61
+
62
+
63
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class McNemar(evaluate.Comparison):
    # Comparison module that runs McNemar's chi-square test on the paired
    # right/wrong outcomes of two classifiers against shared reference labels.

    def _info(self):
        """Return the module metadata and the per-example input schema."""
        return evaluate.ComparisonInfo(
            module_type="comparison",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions1": datasets.Value("int64"),
                    "predictions2": datasets.Value("int64"),
                    "references": datasets.Value("int64"),
                }
            ),
        )

    def _compute(self, predictions1, predictions2, references):
        """Compute the McNemar statistic and its p value.

        Args:
            predictions1: Predicted labels of the first model.
            predictions2: Predicted labels of the second model.
            references: Ground truth labels.

        Returns:
            A dict with `stat` (the test statistic) and `p` (the chi-square
            survival function of `stat` with one degree of freedom). When the
            two models never disagree in correctness, `stat` is 0.0 and `p`
            is 1.0.
        """
        # Only the discordant cells of the contingency table enter the
        # statistic: examples that exactly one of the two models got right.
        only_first_correct = 0
        only_second_correct = 0
        for gt, p1, p2 in zip(references, predictions1, predictions2):
            first_ok = p1 == gt
            second_ok = p2 == gt
            if first_ok and not second_ok:
                only_first_correct += 1
            elif second_ok and not first_ok:
                only_second_correct += 1

        discordant = only_first_correct + only_second_correct
        if discordant == 0:
            # No disagreement at all: the ratio would be 0/0. Report "no
            # evidence of a difference" instead of raising ZeroDivisionError.
            statistic = 0.0
        else:
            statistic = abs(only_first_correct - only_second_correct) ** 2 / (1.0 * discordant)

        df = 1
        pvalue = chi2.sf(statistic, df)
        return {"stat": statistic, "p": pvalue}
evaluate/comparisons/mcnemar/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2
+ scipy
evaluate/comparisons/wilcoxon/README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Wilcoxon
3
+ emoji: 🤗
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.0.2
8
+ app_file: app.py
9
+ pinned: false
10
+ tags:
11
+ - evaluate
12
+ - comparison
13
+ description: >-
14
+ Wilcoxon's test is a signed-rank test for comparing paired samples.
15
+ ---
16
+
17
+
18
+ # Comparison Card for Wilcoxon
19
+
20
+ ## Comparison description
21
+
22
+ Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models.
23
+
24
+ ## How to use
25
+
26
+ The Wilcoxon comparison is used to analyze paired ordinal data.
27
+
28
+ ## Inputs
29
+
30
+ Its arguments are:
31
+
32
+ `predictions1`: a list of predictions from the first model.
33
+
34
+ `predictions2`: a list of predictions from the second model.
35
+
36
+ ## Output values
37
+
38
+ The Wilcoxon comparison outputs two things:
39
+
40
+ `stat`: The Wilcoxon statistic.
41
+
42
+ `p`: The p value.
43
+
44
+ ## Examples
45
+
46
+ Example comparison:
47
+
48
+ ```python
49
+ wilcoxon = evaluate.load("wilcoxon")
50
+ results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
51
+ print(results)
52
+ {'stat': 5.0, 'p': 0.625}
53
+ ```
54
+
55
+ ## Limitations and bias
56
+
57
+ The Wilcoxon test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired ordinal data only.
58
+
59
+ ## Citations
60
+
61
+ ```bibtex
62
+ @incollection{wilcoxon1992individual,
63
+ title={Individual comparisons by ranking methods},
64
+ author={Wilcoxon, Frank},
65
+ booktitle={Breakthroughs in statistics},
66
+ pages={196--202},
67
+ year={1992},
68
+ publisher={Springer}
69
+ }
70
+ ```
evaluate/comparisons/wilcoxon/app.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+
4
+
5
# Resolve the Wilcoxon comparison module, then hand it to the Gradio helper
# that serves the interactive demo widget for this Space.
module = evaluate.load(
    "wilcoxon",
    module_type="comparison",
)
launch_gradio_widget(module)
evaluate/comparisons/wilcoxon/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/huggingface/evaluate@a45df1eb9996eec64ec3282ebe554061cb366388
2
+ datasets~=2.0
3
+ scipy
evaluate/comparisons/wilcoxon/wilcoxon.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Evaluate Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Wilcoxon test for model comparison."""
15
+
16
+ import datasets
17
+ from scipy.stats import wilcoxon
18
+
19
+ import evaluate
20
+
21
+
22
+ _DESCRIPTION = """
23
+ Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models.
24
+ """
25
+
26
+
27
+ _KWARGS_DESCRIPTION = """
28
+ Args:
29
+ predictions1 (`list` of `float`): Predictions for model 1.
30
+ predictions2 (`list` of `float`): Predictions for model 2.
31
+
32
+ Returns:
33
+ stat (`float`): Wilcoxon test score.
34
+ p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference.
35
+
36
+ Examples:
37
+ >>> wilcoxon = evaluate.load("wilcoxon")
38
+ >>> results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
39
+ >>> print(results)
40
+ {'stat': 5.0, 'p': 0.625}
41
+ """
42
+
43
+
44
+ _CITATION = """
45
+ @incollection{wilcoxon1992individual,
46
+ title={Individual comparisons by ranking methods},
47
+ author={Wilcoxon, Frank},
48
+ booktitle={Breakthroughs in statistics},
49
+ pages={196--202},
50
+ year={1992},
51
+ publisher={Springer}
52
+ }
53
+ """
54
+
55
+
56
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Wilcoxon(evaluate.Comparison):
    # Comparison module that applies the Wilcoxon signed-rank test to the
    # element-wise differences between two models' predictions.

    def _info(self):
        """Describe the module and declare one float per model per example."""
        input_schema = datasets.Features(
            {
                "predictions1": datasets.Value("float"),
                "predictions2": datasets.Value("float"),
            }
        )
        return evaluate.ComparisonInfo(
            module_type="comparison",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_schema,
        )

    def _compute(self, predictions1, predictions2):
        """Run the signed-rank test on `predictions1 - predictions2`.

        Returns a dict holding the test statistic under `stat` and the
        p value under `p`, both taken from the SciPy result object.
        """
        # Paired differences, first model minus second model.
        differences = []
        for first, second in zip(predictions1, predictions2):
            differences.append(first - second)

        outcome = wilcoxon(differences)
        return {"stat": outcome.statistic, "p": outcome.pvalue}
evaluate/docs/README.md ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # Generating the documentation
18
+
19
+ To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
20
+ you can install them with the following command, at the root of the code repository:
21
+
22
+ ```bash
23
+ pip install -e ".[docs]"
24
+ ```
25
+
26
+ Then you need to install our special tool that builds the documentation:
27
+
28
+ ```bash
29
+ pip install git+https://github.com/huggingface/doc-builder
30
+ ```
31
+
32
+ ---
33
+ **NOTE**
34
+
35
+ You only need to generate the documentation to inspect it locally (if you're planning changes and want to
36
+ check how they look before committing for instance). You don't have to commit the built documentation.
37
+
38
+ ---
39
+
40
+ ## Building the documentation
41
+
42
+ Once you have set up the `doc-builder` and additional packages, you can generate the documentation by typing the
43
+ following command:
44
+
45
+ ```bash
46
+ doc-builder build evaluate docs/source/ --build_dir ~/tmp/test-build
47
+ ```
48
+
49
+ You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
50
+ the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
51
+ Markdown editor.
52
+
53
+ ---
54
+ **NOTE**
55
+
56
+ It's not possible to see locally what the final documentation will look like for now. Once you have opened a PR, you
57
+ will see a bot add a comment with a link to where the documentation with your changes lives.
58
+
59
+ ---
60
+
61
+ ## Adding a new element to the navigation bar
62
+
63
+ Accepted files are Markdown (.md or .mdx).
64
+
65
+ Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting
66
+ the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/transformers/blob/master/docs/source/_toctree.yml) file.
67
+
68
+ ## Renaming section headers and moving sections
69
+
70
+ It helps to keep the old links working when renaming section headers and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums and Social media, and it makes for a much better user experience if users reading those months later can still easily navigate to the originally intended information.
71
+
72
+ Therefore we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
73
+
74
+ So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file:
75
+
76
+ ```
77
+ Sections that were moved:
78
+
79
+ [ <a href="#section-b">Section A</a><a id="section-a"></a> ]
80
+ ```
81
+ and of course if you moved it to another file, then:
82
+
83
+ ```
84
+ Sections that were moved:
85
+
86
+ [ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
87
+ ```
88
+
89
+ Use the relative style to link to the new file so that the versioned docs continue to work.
90
+
91
+ For an example of a rich moved sections set please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/master/docs/source/main_classes/trainer.mdx).
92
+
93
+
94
+ ## Writing Documentation - Specification
95
+
96
+ The `huggingface/transformers` documentation follows the
97
+ [Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,
98
+ although we can write them directly in Markdown.
99
+
100
+ ### Adding a new tutorial
101
+
102
+ Adding a new tutorial or section is done in two steps:
103
+
104
+ - Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
105
+ - Link that file in `./source/_toctree.yml` on the correct toc-tree.
106
+
107
+ Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
108
+ depending on the intended targets (beginners, more advanced users or researchers) it should go in section two, three or
109
+ four.
110
+
111
+ ### Adding a new model
112
+
113
+ When adding a new model:
114
+
115
+ - Create a file `xxx.mdx` under `./source/model_doc` (don't hesitate to copy an existing file as template).
116
+ - Link that file in `./source/_toctree.yml`.
117
+ - Write a short overview of the model:
118
+ - Overview with paper & authors
119
+ - Paper abstract
120
+ - Tips and tricks and how to use it best
121
+ - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
122
+ every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
123
+ The order is generally:
124
+ - Configuration,
125
+ - Tokenizer
126
+ - PyTorch base model
127
+ - PyTorch head models
128
+ - TensorFlow base model
129
+ - TensorFlow head models
130
+ - Flax base model
131
+ - Flax head models
132
+
133
+ These classes should be added using our Markdown syntax. Usually as follows:
134
+
135
+ ```
136
+ ## XXXConfig
137
+
138
+ [[autodoc]] XXXConfig
139
+ ```
140
+
141
+ This will include every public method of the configuration that is documented. If for some reason you wish for a method
142
+ not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:
143
+
144
+ ```
145
+ ## XXXTokenizer
146
+
147
+ [[autodoc]] XXXTokenizer
148
+ - build_inputs_with_special_tokens
149
+ - get_special_tokens_mask
150
+ - create_token_type_ids_from_sequences
151
+ - save_vocabulary
152
+ ```
153
+
154
+ If you just want to add a method that is not documented (for instance magic methods like `__call__` are not documented
155
+ by default) you can put the list of methods to add in a list that contains `all`:
156
+
157
+ ```
158
+ ## XXXTokenizer
159
+
160
+ [[autodoc]] XXXTokenizer
161
+ - all
162
+ - __call__
163
+ ```
164
+
165
+ ### Writing source documentation
166
+
167
+ Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
168
+ and objects like True, None or any strings should usually be put in `code`.
169
+
170
+ When mentioning a class, function or method, it is recommended to use our syntax for internal links so that our tool
171
+ adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or
172
+ function to be in the main package.
173
+
174
+ If you want to create a link to some internal class or function, you need to
175
+ provide its path. For instance: \[\`file_utils.ModelOutput\`\]. This will be converted into a link with
176
+ `file_utils.ModelOutput` in the description. To get rid of the path and only keep the name of the object you are
177
+ linking to in the description, add a ~: \[\`~file_utils.ModelOutput\`\] will generate a link with `ModelOutput` in the description.
178
+
179
+ The same works for methods so you can either use \[\`XXXClass.method\`\] or \[~\`XXXClass.method\`\].
180
+
181
+ #### Defining arguments in a method
182
+
183
+ Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
184
+ an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon and its
185
+ description:
186
+
187
+ ```
188
+ Args:
189
+ n_layers (`int`): The number of layers of the model.
190
+ ```
191
+
192
+ If the description is too long to fit in one line, another indentation is necessary before writing the description
193
+ after the argument.
194
+
195
+ Here's an example showcasing everything so far:
196
+
197
+ ```
198
+ Args:
199
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
200
+ Indices of input sequence tokens in the vocabulary.
201
+
202
+ Indices can be obtained using [`AlbertTokenizer`]. See [`~PreTrainedTokenizer.encode`] and
203
+ [`~PreTrainedTokenizer.__call__`] for details.
204
+
205
+ [What are input IDs?](../glossary#input-ids)
206
+ ```
207
+
208
+ For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
209
+ following signature:
210
+
211
+ ```
212
+ def my_function(x: str = None, a: float = 1):
213
+ ```
214
+
215
+ then its documentation should look like this:
216
+
217
+ ```
218
+ Args:
219
+ x (`str`, *optional*):
220
+ This argument controls ...
221
+ a (`float`, *optional*, defaults to 1):
222
+ This argument is used to ...
223
+ ```
224
+
225
+ Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
226
+ if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
227
+ however write as many lines as you want in the indented description (see the example above with `input_ids`).
228
+
229
+ #### Writing a multi-line code block
230
+
231
+ Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:
232
+
233
+
234
+ ````
235
+ ```
236
+ # first line of code
237
+ # second line
238
+ # etc
239
+ ```
240
+ ````
241
+
242
+ We follow the [doctest](https://docs.python.org/3/library/doctest.html) syntax for the examples to automatically test
243
+ the results stay consistent with the library.
244
+
245
+ #### Writing a return block
246
+
247
+ The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
248
+ The first line should be the type of the return, followed by a line return. No need to indent further for the elements
249
+ building the return.
250
+
251
+ Here's an example for a single value return:
252
+
253
+ ```
254
+ Returns:
255
+ `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
256
+ ```
257
+
258
+ Here's an example for tuple return, comprising several objects:
259
+
260
+ ```
261
+ Returns:
262
+ `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
263
+ - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
264
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
265
+ - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
266
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
267
+ ```
268
+
269
+ #### Adding an image
270
+
271
+ Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
272
+ the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
273
+ them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
274
+ If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
275
+ to this dataset.
276
+
277
+ ## Styling the docstring
278
+
279
+ We have an automatic script running with the `make style` command that will make sure that:
280
+ - the docstrings fully take advantage of the line width
281
+ - all code examples are formatted using black, like the code of the Transformers library
282
+
283
+ This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
284
+ recommended to commit your changes before running `make style`, so you can revert the changes done by that script
285
+ easily.
evaluate/docs/source/_toctree.yml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - sections:
2
+ - local: index
3
+ title: 🤗 Evaluate
4
+ title: Get started
5
+ - sections:
6
+ - local: installation
7
+ title: Installation
8
+ - local: a_quick_tour
9
+ title: A quick tour
10
+ title: Tutorials
11
+ - sections:
12
+ - local: choosing_a_metric
13
+ title: Choosing the right metric
14
+ - local: creating_and_sharing
15
+ title: Adding new evaluations
16
+ - local: base_evaluator
17
+ title: Using the evaluator
18
+ - local: custom_evaluator
19
+ title: Using the evaluator with custom pipelines
20
+ - local: evaluation_suite
21
+ title: Creating an EvaluationSuite
22
+ - sections:
23
+ - local: transformers_integrations
24
+ title: Transformers
25
+ - local: keras_integrations
26
+ title: Keras and Tensorflow
27
+ - local: sklearn_integrations
28
+ title: scikit-learn
29
+ title: Using 🤗 Evaluate with other ML frameworks
30
+ title: "How-to guides"
31
+ - sections:
32
+ - local: types_of_evaluations
33
+ title: Types of evaluations
34
+ - local: considerations
35
+ title: Considerations for model evaluation
36
+ title: "Conceptual guides"
37
+ - sections:
38
+ - local: package_reference/main_classes
39
+ title: Main classes
40
+ - local: package_reference/loading_methods
41
+ title: Loading methods
42
+ - local: package_reference/saving_methods
43
+ title: Saving methods
44
+ - local: package_reference/hub_methods
45
+ title: Hub methods
46
+ - local: package_reference/evaluator_classes
47
+ title: Evaluator classes
48
+ - local: package_reference/visualization_methods
49
+ title: Visualization methods
50
+ - local: package_reference/logging_methods
51
+ title: Logging methods
52
+ title: "Reference"
evaluate/docs/source/a_quick_tour.mdx ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A quick tour
2
+
3
+ 🤗 Evaluate provides access to a wide range of evaluation tools. It covers a range of modalities such as text, computer vision, audio, etc. as well as tools to evaluate models or datasets. These tools are split into three categories.
4
+
5
+ ## Types of evaluations
6
+
7
+ There are different aspects of a typical machine learning pipeline that can be evaluated and for each aspect 🤗 Evaluate provides a tool:
8
+
9
+ - **Metric**: A metric is used to evaluate a model's performance and usually involves the model's predictions as well as some ground truth labels. You can find all integrated metrics at [evaluate-metric](https://huggingface.co/evaluate-metric).
10
+ - **Comparison**: A comparison is used to compare two models. This can for example be done by comparing their predictions to ground truth labels and computing their agreement. You can find all integrated comparisons at [evaluate-comparison](https://huggingface.co/evaluate-comparison).
11
+ - **Measurement**: The dataset is as important as the model trained on it. With measurements one can investigate a dataset's properties. You can find all integrated measurements at [evaluate-measurement](https://huggingface.co/evaluate-measurement).
12
+
13
+ Each of these evaluation modules lives on the Hugging Face Hub as a Space. Each comes with an interactive widget and a documentation card documenting its use and limitations. For example [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy):
14
+
15
+ <div class="flex justify-center">
16
+ <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/metric-widget.png" width="400"/>
17
+ </div>
18
+
19
+ Each metric, comparison, and measurement is a separate Python module, but for using any of them, there is a single entry point: [`evaluate.load`]!
20
+
21
+ ## Load
22
+
23
+ Any metric, comparison, or measurement is loaded with the `evaluate.load` function:
24
+
25
+ ```py
26
+ >>> import evaluate
27
+ >>> accuracy = evaluate.load("accuracy")
28
+ ```
29
+
30
+ If you want to make sure you are loading the right type of evaluation (especially if there are name clashes) you can explicitly pass the type:
31
+
32
+ ```py
33
+ >>> word_length = evaluate.load("word_length", module_type="measurement")
34
+ ```
35
+
36
+ ### Community modules
37
+
38
+ Besides the modules implemented in 🤗 Evaluate you can also load any community module by specifying the repository ID of the metric implementation:
39
+
40
+ ```py
41
+ >>> element_count = evaluate.load("lvwerra/element_count", module_type="measurement")
42
+ ```
43
+
44
+ See the [Creating and Sharing Guide](/docs/evaluate/main/en/creating_and_sharing) for information about uploading custom metrics.
45
+
46
+ ### List available modules
47
+
48
+ With [`list_evaluation_modules`] you can check what modules are available on the hub. You can also filter for specific modules and skip community metrics if you want. You can also see additional information such as likes:
49
+
50
+ ```python
51
+ >>> evaluate.list_evaluation_modules(
52
+ ... module_type="comparison",
53
+ ... include_community=False,
54
+ ... with_details=True)
55
+
56
+ [{'name': 'mcnemar', 'type': 'comparison', 'community': False, 'likes': 1},
57
+ {'name': 'exact_match', 'type': 'comparison', 'community': False, 'likes': 0}]
58
+ ```
59
+
60
+ ## Module attributes
61
+
62
+ All evaluation modules come with a range of useful attributes that help to use a module stored in an [`EvaluationModuleInfo`] object.
63
+
64
+ |Attribute|Description|
65
+ |---|---|
66
+ |`description`|A short description of the evaluation module.|
67
+ |`citation`|A BibTex string for citation when available.|
68
+ |`features`|A `Features` object defining the input format.|
69
+ |`inputs_description`|This is equivalent to the module's docstring.|
70
+ |`homepage`|The homepage of the module.|
71
+ |`license`|The license of the module.|
72
+ |`codebase_urls`|Link to the code behind the module.|
73
+ |`reference_urls`|Additional reference URLs.|
74
+
75
+ Let's have a look at a few examples. First, let's look at the `description` attribute of the accuracy metric:
76
+
77
+ ```py
78
+ >>> accuracy = evaluate.load("accuracy")
79
+ >>> accuracy.description
80
+ Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
81
+ Accuracy = (TP + TN) / (TP + TN + FP + FN)
82
+ Where:
83
+ TP: True positive
84
+ TN: True negative
85
+ FP: False positive
86
+ FN: False negative
87
+ ```
88
+
89
+ You can see that it describes how the metric works in theory. If you use this metric for your work, especially if it is an academic publication you want to reference it properly. For that you can look at the `citation` attribute:
90
+
91
+ ```py
92
+ >>> accuracy.citation
93
+ @article{scikit-learn,
94
+ title={Scikit-learn: Machine Learning in {P}ython},
95
+ author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
96
+ and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
97
+ and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
98
+ Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
99
+ journal={Journal of Machine Learning Research},
100
+ volume={12},
101
+ pages={2825--2830},
102
+ year={2011}
103
+ }
104
+ ```
105
+
106
+ Before we can apply a metric or other evaluation module to a use-case, we need to know what the input format of the metric is:
107
+
108
+ ```py
109
+ >>> accuracy.features
110
+ {
111
+ 'predictions': Value(dtype='int32', id=None),
112
+ 'references': Value(dtype='int32', id=None)
113
+ }
114
+ ```
115
+
116
+ <Tip>
117
+
118
+ Note that features always describe the type of a single input element. In general we will add lists of elements so you can always think of a list around the types in `features`. Evaluate accepts various input formats (Python lists, NumPy arrays, PyTorch tensors, etc.) and converts them to an appropriate format for storage and computation.
119
+
120
+ </Tip>
121
+
122
+ ## Compute
123
+
124
+ Now that we know how the evaluation module works and what should go in there we want to actually use it! When it comes to computing the actual score there are two main ways to do it:
125
+
126
+ 1. All-in-one
127
+ 2. Incremental
128
+
129
+
130
+ In the incremental approach the necessary inputs are added to the module with [`EvaluationModule.add`] or [`EvaluationModule.add_batch`] and the score is calculated at the end with [`EvaluationModule.compute`]. Alternatively, one can pass all the inputs at once to `compute()`. Let's have a look at the two approaches.
131
+
132
+ ### How to compute
133
+
134
+ The simplest way to calculate the score of an evaluation module is by calling `compute()` directly with the necessary inputs. Simply pass the inputs as seen in `features` to the `compute()` method.
135
+
136
+ ```py
137
+ >>> accuracy.compute(references=[0,1,0,1], predictions=[1,0,0,1])
138
+ {'accuracy': 0.5}
139
+ ```
140
+ Evaluation modules return the results in a dictionary. However, in some instances you build up the predictions iteratively or in a distributed fashion in which case `add()` or `add_batch()` are useful.
141
+
142
+ ### Calculate a single metric or a batch of metrics
143
+
144
+ In many evaluation pipelines you build the predictions iteratively such as in a for-loop. In that case you could store the predictions in a list and at the end pass them to `compute()`. With `add()` and `add_batch()` you can circumvent the step of storing the predictions separately. If you are only creating single predictions at a time you can use `add()`:
145
+
146
+ ```py
147
+ >>> for ref, pred in zip([0,1,0,1], [1,0,0,1]):
148
+ >>> accuracy.add(references=ref, predictions=pred)
149
+ >>> accuracy.compute()
150
+ {'accuracy': 0.5}
151
+ ```
152
+
153
+ Once you have gathered all predictions you can call `compute()` to compute the score based on all stored values. When getting predictions and references in batches you can use `add_batch()` which adds a list of elements for later processing. The rest works as with `add()`:
154
+
155
+ ```py
156
+ >>> for refs, preds in zip([[0,1],[0,1]], [[1,0],[0,1]]):
157
+ >>> accuracy.add_batch(references=refs, predictions=preds)
158
+ >>> accuracy.compute()
159
+ {'accuracy': 0.5}
160
+ ```
161
+
162
+ This is especially useful when you need to get the predictions from your model in batches:
163
+
164
+ ```py
165
+ >>> for model_inputs, gold_standards in evaluation_dataset:
166
+ >>> predictions = model(model_inputs)
167
+ >>> metric.add_batch(references=gold_standards, predictions=predictions)
168
+ >>> metric.compute()
169
+ ```
170
+
171
+ ### Distributed evaluation
172
+
173
+ Computing metrics in a distributed environment can be tricky. Metric evaluation is executed in separate Python processes, or nodes, on different subsets of a dataset. Typically, when a metric score is additive (`f(AuB) = f(A) + f(B)`), you can use distributed reduce operations to gather the scores for each subset of the dataset. But when a metric is non-additive (`f(AuB) ≠ f(A) + f(B)`), it's not that simple. For example, you can't take the sum of the [F1](https://huggingface.co/spaces/evaluate-metric/f1) scores of each data subset as your **final metric**.
174
+
175
+ A common way to overcome this issue is to fall back on single-process evaluation. The metrics are evaluated on a single GPU, which becomes inefficient.
176
+
177
+ 🤗 Evaluate solves this issue by only computing the final metric on the first node. The predictions and references are computed and provided to the metric separately for each node. These are temporarily stored in an Apache Arrow table, avoiding cluttering the GPU or CPU memory. When you are ready to `compute()` the final metric, the first node is able to access the predictions and references stored on all the other nodes. Once it has gathered all the predictions and references, `compute()` will perform the final metric evaluation.
178
+
179
+ This solution allows 🤗 Evaluate to perform distributed predictions, which is important for evaluation speed in distributed settings. At the same time, you can also use complex non-additive metrics without wasting valuable GPU or CPU memory.
180
+
181
+ ## Combining several evaluations
182
+
183
+ Often one wants to not only evaluate a single metric but a range of different metrics capturing different aspects of a model. E.g. for classification it is usually a good idea to compute F1-score, recall, and precision in addition to accuracy to get a better picture of model performance. Naturally, you can load a bunch of metrics and call them sequentially. However, a more convenient way is to use the [`~evaluate.combine`] function to bundle them together:
184
+
185
+ ```python
186
+ >>> clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
187
+ ```
188
+
189
+ The `combine` function accepts both a list of metric names and instantiated modules. The `compute` call then computes each metric:
190
+
191
+ ```python
192
+ >>> clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])
193
+
194
+ {
195
+ 'accuracy': 0.667,
196
+ 'f1': 0.667,
197
+ 'precision': 1.0,
198
+ 'recall': 0.5
199
+ }
200
+ ```
201
+
202
+ ## Save and push to the Hub
203
+
204
+ Saving and sharing evaluation results is an important step. We provide the [`evaluate.save`] function to easily save metrics results. You can either pass a specific filename or a directory. In the latter case, the results are saved in a file with an automatically created file name. Besides the directory or file name, the function takes any key-value pairs as inputs and stores them in a JSON file.
205
+
206
+ ```py
207
+ >>> result = accuracy.compute(references=[0,1,0,1], predictions=[1,0,0,1])
208
+
209
+ >>> hyperparams = {"model": "bert-base-uncased"}
210
+ >>> evaluate.save("./results/", experiment="run 42", **result, **hyperparams)
211
+ PosixPath('results/result-2022_05_30-22_09_11.json')
212
+ ```
213
+
214
+ The content of the JSON file looks like the following:
215
+
216
+ ```json
217
+ {
218
+ "experiment": "run 42",
219
+ "accuracy": 0.5,
220
+ "model": "bert-base-uncased",
221
+ "_timestamp": "2022-05-30T22:09:11.959469",
222
+ "_git_commit_hash": "123456789abcdefghijkl",
223
+ "_evaluate_version": "0.1.0",
224
+ "_python_version": "3.9.12 (main, Mar 26 2022, 15:51:15) \n[Clang 13.1.6 (clang-1316.0.21.2)]",
225
+ "_interpreter_path": "/Users/leandro/git/evaluate/env/bin/python"
226
+ }
227
+ ```
228
+
229
+ In addition to the specified fields, it also contains useful system information for reproducing the results.
230
+
231
+ Besides storing the results locally, you should report them on the model's repository on the Hub. With the [`evaluate.push_to_hub`] function, you can easily report evaluation results to the model's repository:
232
+
233
+ ```py
234
+ evaluate.push_to_hub(
235
+ model_id="huggingface/gpt2-wikitext2", # model repository on hub
236
+ metric_value=0.5, # metric value
237
+ metric_type="bleu", # metric name, e.g. accuracy.name
238
+ metric_name="BLEU", # pretty name which is displayed
239
+ dataset_type="wikitext", # dataset name on the hub
240
+ dataset_name="WikiText", # pretty name
241
+ dataset_split="test", # dataset split used
242
+ task_type="text-generation", # task id, see https://github.com/huggingface/evaluate/blob/main/src/evaluate/config.py#L154-L192
243
+ task_name="Text Generation" # pretty name for task
244
+ )
245
+ ```
246
+
247
+ ## Evaluator
248
+
249
+ The [`evaluate.evaluator`] provides automated evaluation and only requires a model, dataset, metric in contrast to the metrics in `EvaluationModule`s that require the model's predictions. As such it is easier to evaluate a model on a dataset with a given metric as the inference is handled internally. To make that possible it uses the [`~transformers.pipeline`] abstraction from `transformers`. However, you can use your own framework as long as it follows the `pipeline` interface.
250
+
251
+ To make an evaluation with the `evaluator` let's load a `transformers` pipeline (but you can pass your own custom inference class for any framework as long as it follows the pipeline call API) with a model trained on IMDb, the IMDb test split and the accuracy metric.
252
+
253
+ ```python
254
+ from transformers import pipeline
255
+ from datasets import load_dataset
256
+ from evaluate import evaluator
257
+ import evaluate
258
+
259
+ pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb", device=0)
260
+ data = load_dataset("imdb", split="test").shuffle().select(range(1000))
261
+ metric = evaluate.load("accuracy")
262
+ ```
263
+
264
+ Then you can create an evaluator for text classification and pass the three objects to the `compute()` method. With the label mapping `evaluate` provides a method to align the pipeline outputs with the label column in the dataset:
265
+
266
+ ```python
267
+ >>> task_evaluator = evaluator("text-classification")
268
+
269
+ >>> results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric,
270
+ ... label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
271
+
272
+ >>> print(results)
273
+ {'accuracy': 0.934}
274
+ ```
275
+
276
+ Calculating the value of the metric alone is often not enough to know if a model performs significantly better than another one. With _bootstrapping_ `evaluate` computes confidence intervals and the standard error which helps estimate how stable a score is:
277
+
278
+ ```python
279
+ >>> results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric,
280
+ ... label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
281
+ ... strategy="bootstrap", n_resamples=200)
282
+
283
+ >>> print(results)
284
+ {'accuracy':
285
+ {
286
+ 'confidence_interval': (0.906, 0.9406749892841922),
287
+ 'standard_error': 0.00865213251082787,
288
+ 'score': 0.923
289
+ }
290
+ }
291
+ ```
292
+
293
+ The evaluator expects a `"text"` and `"label"` column for the data input. If your dataset differs you can provide the columns with the keywords `input_column="text"` and `label_column="label"`. Currently only `"text-classification"` is supported with more tasks being added in the future.
294
+
295
+
296
+ ## Visualization
297
+
298
+ When comparing several models, sometimes it's hard to spot the differences in their performance simply by looking at their scores. Also often there is not a single best model but there are trade-offs between e.g. latency and accuracy as larger models might have better performance but are also slower. We are gradually adding different visualization approaches, such as plots, to make choosing the best model for a use-case easier.
299
+
300
+ For instance, if you have a list of results from multiple models (as dictionaries), you can feed them into the `radar_plot()` function:
301
+
302
+ ```python
303
+ import evaluate
304
+ from evaluate.visualization import radar_plot
305
+
306
+ >>> data = [
307
+ {"accuracy": 0.99, "precision": 0.8, "f1": 0.95, "latency_in_seconds": 33.6},
308
+ {"accuracy": 0.98, "precision": 0.87, "f1": 0.91, "latency_in_seconds": 11.2},
309
+ {"accuracy": 0.98, "precision": 0.78, "f1": 0.88, "latency_in_seconds": 87.6},
310
+ {"accuracy": 0.88, "precision": 0.78, "f1": 0.81, "latency_in_seconds": 101.6}
311
+ ]
312
+ >>> model_names = ["Model 1", "Model 2", "Model 3", "Model 4"]
313
+ >>> plot = radar_plot(data=data, model_names=model_names)
314
+ >>> plot.show()
315
+ ```
316
+
317
+ Which lets you visually compare the 4 models and choose the optimal one for you, based on one or several metrics:
318
+ <div class="flex justify-center">
319
+ <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/example_viz.png" width="400"/>
320
+ </div>
321
+
322
+ ## Running evaluation on a suite of tasks
323
+
324
+ It can be useful to evaluate models on a variety of different tasks to understand their downstream performance. The [EvaluationSuite](evaluation_suite) enables evaluation of models on a collection of tasks. Tasks can be constructed as ([evaluator](base_evaluator), dataset, metric) tuples and passed to an [EvaluationSuite](evaluation_suite) stored on the Hugging Face Hub as a Space, or locally as a Python script. See the [evaluator documentation](base_evaluator) for a list of currently supported tasks.
325
+
326
+ `EvaluationSuite` scripts can be defined as follows, and support Python code for data preprocessing.
327
+
328
+ ```python
329
+ import evaluate
330
+ from evaluate.evaluation_suite import SubTask
331
+
332
+ class Suite(evaluate.EvaluationSuite):
333
+
334
+ def __init__(self, name):
335
+ super().__init__(name)
336
+
337
+ self.suite = [
338
+ SubTask(
339
+ task_type="text-classification",
340
+ data="imdb",
341
+ split="test[:1]",
342
+ args_for_task={
343
+ "metric": "accuracy",
344
+ "input_column": "text",
345
+ "label_column": "label",
346
+ "label_mapping": {
347
+ "LABEL_0": 0.0,
348
+ "LABEL_1": 1.0
349
+ }
350
+ }
351
+ ),
352
+ SubTask(
353
+ task_type="text-classification",
354
+ data="sst2",
355
+ split="test[:1]",
356
+ args_for_task={
357
+ "metric": "accuracy",
358
+ "input_column": "sentence",
359
+ "label_column": "label",
360
+ "label_mapping": {
361
+ "LABEL_0": 0.0,
362
+ "LABEL_1": 1.0
363
+ }
364
+ }
365
+ )
366
+ ]
367
+ ```
368
+
369
+ Evaluation can be run by loading the `EvaluationSuite` and calling the `run()` method with a model or pipeline.
370
+
371
+ ```
372
+ >>> from evaluate import EvaluationSuite
373
+ >>> suite = EvaluationSuite.load('mathemakitten/sentiment-evaluation-suite')
374
+ >>> results = suite.run("huggingface/prunebert-base-uncased-6-finepruned-w-distil-mnli")
375
+ ```
376
+
377
+ | accuracy | total_time_in_seconds | samples_per_second | latency_in_seconds | task_name |
378
+ |------------:|---------------------:|--------------------------:|:----------------|:-----------|
379
+ | 0.3 | 4.62804 | 2.16074 | 0.462804 | imdb |
380
+ | 0 | 0.686388 | 14.569 | 0.0686388 | sst2 |
evaluate/docs/source/base_evaluator.mdx ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Using the `evaluator`
2
+
3
+ The `Evaluator` classes allow you to evaluate a triplet of model, dataset, and metric. The model is wrapped in a pipeline, which is responsible for handling all preprocessing and post-processing. Out of the box, `Evaluator`s support transformers pipelines for the supported tasks, but custom pipelines can be passed, as showcased in the section [Using the `evaluator` with custom pipelines](custom_evaluator).
4
+
5
+ Currently supported tasks are:
6
+ - `"text-classification"`: will use the [`TextClassificationEvaluator`].
7
+ - `"token-classification"`: will use the [`TokenClassificationEvaluator`].
8
+ - `"question-answering"`: will use the [`QuestionAnsweringEvaluator`].
9
+ - `"image-classification"`: will use the [`ImageClassificationEvaluator`].
10
+ - `"text-generation"`: will use the [`TextGenerationEvaluator`].
11
+ - `"text2text-generation"`: will use the [`Text2TextGenerationEvaluator`].
12
+ - `"summarization"`: will use the [`SummarizationEvaluator`].
13
+ - `"translation"`: will use the [`TranslationEvaluator`].
14
+ - `"automatic-speech-recognition"`: will use the [`AutomaticSpeechRecognitionEvaluator`].
15
+ - `"audio-classification"`: will use the [`AudioClassificationEvaluator`].
16
+
17
+ To run an `Evaluator` with several tasks in a single call, use the [EvaluationSuite](evaluation_suite), which runs evaluations on a collection of `SubTask`s.
18
+
19
+ Each task has its own set of requirements for the dataset format and pipeline output, so make sure to check them out for your custom use case. Let's have a look at some of them and see how you can use the evaluator to evaluate one or several models, datasets, and metrics at the same time.
20
+
21
+ ## Text classification
22
+
23
+ The text classification evaluator can be used to evaluate text models on classification datasets such as IMDb. Beside the model, data, and metric inputs it takes the following optional inputs:
24
+
25
+ - `input_column="text"`: with this argument the column with the data for the pipeline can be specified.
26
+ - `label_column="label"`: with this argument the column with the labels for the evaluation can be specified.
27
+ - `label_mapping=None`: the label mapping aligns the labels in the pipeline output with the labels needed for evaluation. E.g. the labels in `label_column` can be integers (`0`/`1`) whereas the pipeline can produce label names such as `"positive"`/`"negative"`. With that dictionary the pipeline outputs are mapped to the labels.
28
+
29
+ By default the `"accuracy"` metric is computed.
30
+
31
+ ### Evaluate models on the Hub
32
+
33
+ There are several ways to pass a model to the evaluator: you can pass the name of a model on the Hub, you can load a `transformers` model and pass it to the evaluator or you can pass an initialized `transformers.Pipeline`. Alternatively you can pass any callable function that behaves like a `pipeline` call for the task in any framework.
34
+
35
+ So any of the following works:
36
+
37
+ ```py
38
+ from datasets import load_dataset
39
+ from evaluate import evaluator
40
+ from transformers import AutoModelForSequenceClassification, pipeline
41
+
42
+ data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
43
+ task_evaluator = evaluator("text-classification")
44
+
45
+ # 1. Pass a model name or path
46
+ eval_results = task_evaluator.compute(
47
+ model_or_pipeline="lvwerra/distilbert-imdb",
48
+ data=data,
49
+ label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
50
+ )
51
+
52
+ # 2. Pass an instantiated model
53
+ model = AutoModelForSequenceClassification.from_pretrained("lvwerra/distilbert-imdb")
54
+
55
+ eval_results = task_evaluator.compute(
56
+ model_or_pipeline=model,
57
+ data=data,
58
+ label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
59
+ )
60
+
61
+ # 3. Pass an instantiated pipeline
62
+ pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb")
63
+
64
+ eval_results = task_evaluator.compute(
65
+ model_or_pipeline=pipe,
66
+ data=data,
67
+ label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
68
+ )
69
+ print(eval_results)
70
+ ```
71
+ <Tip>
72
+
73
+ Without specifying a device, the default for model inference will be the first GPU on the machine if one is available, and otherwise the CPU. If you want to use a specific device you can pass `device` to `compute` where -1 will use the CPU and a non-negative integer (starting with 0) will use the associated CUDA device.
74
+
75
+ </Tip>
76
+
77
+
78
+ The results will look as follows:
79
+ ```python
80
+ {
81
+ 'accuracy': 0.918,
82
+ 'latency_in_seconds': 0.013,
83
+ 'samples_per_second': 78.887,
84
+ 'total_time_in_seconds': 12.676
85
+ }
86
+ ```
87
+
88
+ Note that evaluation results include both the requested metric, and information about the time it took to obtain predictions through the pipeline.
89
+
90
+ <Tip>
91
+
92
+ The time performances can give useful indication on model speed for inference but should be taken with a grain of salt: they include all the processing that goes on in the pipeline. This may include tokenizing, post-processing, that may be different depending on the model. Furthermore, it depends a lot on the hardware you are running the evaluation on and you may be able to improve the performance by optimizing things like the batch size.
93
+
94
+ </Tip>
95
+
96
+ ### Evaluate multiple metrics
97
+
98
+ With the [`combine`] function one can bundle several metrics into an object that behaves like a single metric. We can use this to evaluate several metrics at once with the evaluator:
99
+
100
+ ```python
101
+ import evaluate
102
+
103
+ eval_results = task_evaluator.compute(
104
+ model_or_pipeline="lvwerra/distilbert-imdb",
105
+ data=data,
106
+ metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
107
+ label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
108
+ )
109
+ print(eval_results)
110
+
111
+ ```
112
+ The results will look as follows:
113
+ ```python
114
+ {
115
+ 'accuracy': 0.918,
116
+ 'f1': 0.916,
117
+ 'precision': 0.9147,
118
+ 'recall': 0.9187,
119
+ 'latency_in_seconds': 0.013,
120
+ 'samples_per_second': 78.887,
121
+ 'total_time_in_seconds': 12.676
122
+ }
123
+ ```
124
+
125
+ Next let's have a look at token classification.
126
+
127
+ ## Token Classification
128
+
129
+ With the token classification evaluator one can evaluate models for tasks such as NER or POS tagging. It has the following specific arguments:
130
+
131
+ - `input_column="text"`: with this argument the column with the data for the pipeline can be specified.
132
+ - `label_column="label"`: with this argument the column with the labels for the evaluation can be specified.
133
+ - `label_mapping=None`: the label mapping aligns the labels in the pipeline output with the labels needed for evaluation. E.g. the labels in `label_column` can be integers (`0`/`1`) whereas the pipeline can produce label names such as `"positive"`/`"negative"`. With that dictionary the pipeline outputs are mapped to the labels.
134
+ - `join_by=" "`: While most datasets are already tokenized the pipeline expects a string. Thus the tokens need to be joined before passing to the pipeline. By default they are joined with a whitespace.
135
+
136
+ Let's have a look how we can use the evaluator to benchmark several models.
137
+
138
+ ### Benchmarking several models
139
+
140
+ Here is an example where several models can be compared thanks to the `evaluator` in only a few lines of code, abstracting away the preprocessing, inference, postprocessing, metric computation:
141
+
142
+ ```python
143
+ import pandas as pd
144
+ from datasets import load_dataset
145
+ from evaluate import evaluator
146
+ from transformers import pipeline
147
+
148
+ models = [
149
+ "xlm-roberta-large-finetuned-conll03-english",
150
+ "dbmdz/bert-large-cased-finetuned-conll03-english",
151
+ "elastic/distilbert-base-uncased-finetuned-conll03-english",
152
+ "dbmdz/electra-large-discriminator-finetuned-conll03-english",
153
+ "gunghio/distilbert-base-multilingual-cased-finetuned-conll2003-ner",
154
+ "philschmid/distilroberta-base-ner-conll2003",
155
+ "Jorgeutd/albert-base-v2-finetuned-ner",
156
+ ]
157
+
158
+ data = load_dataset("conll2003", split="validation").shuffle().select(range(1000))
159
+ task_evaluator = evaluator("token-classification")
160
+
161
+ results = []
162
+ for model in models:
163
+ results.append(
164
+ task_evaluator.compute(
165
+ model_or_pipeline=model, data=data, metric="seqeval"
166
+ )
167
+ )
168
+
169
+ df = pd.DataFrame(results, index=models)
170
+ df[["overall_f1", "overall_accuracy", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]
171
+ ```
172
+
173
+ The result is a table that looks like this:
174
+
175
+ | model | overall_f1 | overall_accuracy | total_time_in_seconds | samples_per_second | latency_in_seconds |
176
+ |:-------------------------------------------------------------------|-------------:|-------------------:|------------------------:|---------------------:|---------------------:|
177
+ | Jorgeutd/albert-base-v2-finetuned-ner | 0.941 | 0.989 | 4.515 | 221.468 | 0.005 |
178
+ | dbmdz/bert-large-cased-finetuned-conll03-english | 0.962 | 0.881 | 11.648 | 85.850 | 0.012 |
179
+ | dbmdz/electra-large-discriminator-finetuned-conll03-english | 0.965 | 0.881 | 11.456 | 87.292 | 0.011 |
180
+ | elastic/distilbert-base-uncased-finetuned-conll03-english | 0.940 | 0.989 | 2.318 | 431.378 | 0.002 |
181
+ | gunghio/distilbert-base-multilingual-cased-finetuned-conll2003-ner | 0.947 | 0.991 | 2.376 | 420.873 | 0.002 |
182
+ | philschmid/distilroberta-base-ner-conll2003 | 0.961 | 0.994 | 2.436 | 410.579 | 0.002 |
183
+ | xlm-roberta-large-finetuned-conll03-english | 0.969 | 0.882 | 11.996 | 83.359 | 0.012 |
184
+
185
+
186
+ ### Visualizing results
187
+
188
+ You can feed the `results` list above into the `radar_plot()` function to visualize different aspects of their performance and choose the model that is the best fit, depending on the metric(s) that are relevant to your use case:
189
+
190
+ ```python
191
+ import evaluate
192
+ from evaluate.visualization import radar_plot
193
+
194
+ >>> plot = radar_plot(data=results, model_names=models, invert_range=["latency_in_seconds"])
195
+ >>> plot.show()
196
+ ```
197
+
198
+ <div class="flex justify-center">
199
+ <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/viz.png" width="400"/>
200
+ </div>
201
+
202
+
203
+ Don't forget to specify `invert_range` for metrics for which smaller is better (such as the case for latency in seconds).
204
+
205
+ If you want to save the plot locally, you can use the `plot.savefig()` function with the option `bbox_inches='tight'`, to make sure no part of the image gets cut off.
206
+
207
+
208
+ ## Question Answering
209
+
210
+ With the question-answering evaluator one can evaluate models for QA without needing to worry about the complicated pre- and post-processing that's required for these models. It has the following specific arguments:
211
+
212
+
213
+ - `question_column="question"`: the name of the column containing the question in the dataset
214
+ - `context_column="context"`: the name of the column containing the context
215
+ - `id_column="id"`: the name of the column containing the identification field of the question and answer pair
216
+ - `label_column="answers"`: the name of the column containing the answers
217
+ - `squad_v2_format=None`: whether the dataset follows the format of squad_v2 dataset where a question may have no answer in the context. If this parameter is not provided, the format will be automatically inferred.
218
+
219
+ Let's have a look how we can evaluate QA models and compute confidence intervals at the same time.
220
+
221
+ ### Confidence intervals
222
+
223
+ Every evaluator comes with the option to compute confidence intervals using [bootstrapping](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html). Simply pass `strategy="bootstrap"` and set the number of resamples with `n_resamples`.
224
+
225
+ ```python
226
+ from datasets import load_dataset
227
+ from evaluate import evaluator
228
+
229
+ task_evaluator = evaluator("question-answering")
230
+
231
+ data = load_dataset("squad", split="validation[:1000]")
232
+ eval_results = task_evaluator.compute(
233
+ model_or_pipeline="distilbert-base-uncased-distilled-squad",
234
+ data=data,
235
+ metric="squad",
236
+ strategy="bootstrap",
237
+ n_resamples=30
238
+ )
239
+ ```
240
+
241
+ Results include confidence intervals as well as error estimates as follows:
242
+
243
+ ```python
244
+ {
245
+ 'exact_match':
246
+ {
247
+ 'confidence_interval': (79.67, 84.54),
248
+ 'score': 82.30,
249
+ 'standard_error': 1.28
250
+ },
251
+ 'f1':
252
+ {
253
+ 'confidence_interval': (85.30, 88.88),
254
+ 'score': 87.23,
255
+ 'standard_error': 0.97
256
+ },
257
+ 'latency_in_seconds': 0.0085,
258
+ 'samples_per_second': 117.31,
259
+ 'total_time_in_seconds': 8.52
260
+ }
261
+ ```
262
+
263
+ ## Image classification
264
+
265
+ With the image classification evaluator we can evaluate any image classifier. It uses the same keyword arguments as the text classifier:
266
+
267
+ - `input_column="image"`: the name of the column containing the images as PIL ImageFile
268
+ - `label_column="label"`: the name of the column containing the labels
269
+ - `label_mapping=None`: We want to map class labels defined by the model in the pipeline to values consistent with those defined in the `label_column`
270
+
271
+ Let's have a look at how we can evaluate image classification models on large datasets.
272
+
273
+ ### Handling large datasets
274
+
275
+ The evaluator can be used on large datasets! Below, an example shows how to use it on ImageNet-1k for image classification. Beware that this example requires downloading ~150 GB.
276
+
277
+ ```python
278
+ data = load_dataset("imagenet-1k", split="validation", use_auth_token=True)
279
+
280
+ pipe = pipeline(
281
+ task="image-classification",
282
+ model="facebook/deit-small-distilled-patch16-224"
283
+ )
284
+
285
+ task_evaluator = evaluator("image-classification")
286
+ eval_results = task_evaluator.compute(
287
+ model_or_pipeline=pipe,
288
+ data=data,
289
+ metric="accuracy",
290
+ label_mapping=pipe.model.config.label2id
291
+ )
292
+ ```
293
+
294
+ Since we are using `datasets` to store data we make use of a technique called memory mappings. This means that the dataset is never fully loaded into memory which saves a lot of RAM. Running the above code only uses roughly 1.5 GB of RAM while the validation split is more than 30 GB big.
evaluate/docs/source/choosing_a_metric.mdx ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Choosing a metric for your task
2
+
3
+ **So you've trained your model and want to see how well it’s doing on a dataset of your choice. Where do you start?**
4
+
5
+ There is no “one size fits all” approach to choosing an evaluation metric, but some good guidelines to keep in mind are:
6
+
7
+ ## Categories of metrics
8
+
9
+ There are 3 high-level categories of metrics:
10
+
11
+ 1. *Generic metrics*, which can be applied to a variety of situations and datasets, such as precision and accuracy.
12
+ 2. *Task-specific metrics*, which are limited to a given task, such as Machine Translation (often evaluated using metrics [BLEU](https://huggingface.co/metrics/bleu) or [ROUGE](https://huggingface.co/metrics/rouge)) or Named Entity Recognition (often evaluated with [seqeval](https://huggingface.co/metrics/seqeval)).
13
+ 3. *Dataset-specific metrics*, which aim to measure model performance on specific benchmarks: for instance, the [GLUE benchmark](https://huggingface.co/datasets/glue) has a dedicated [evaluation metric](https://huggingface.co/metrics/glue).
14
+
15
+ Let's look at each of these three cases:
16
+
17
+ ### Generic metrics
18
+
19
+ Many of the metrics used in the Machine Learning community are quite generic and can be applied in a variety of tasks and datasets.
20
+
21
+ This is the case for metrics like [accuracy](https://huggingface.co/metrics/accuracy) and [precision](https://huggingface.co/metrics/precision), which can be used for evaluating labeled (supervised) datasets, as well as [perplexity](https://huggingface.co/metrics/perplexity), which can be used for evaluating different kinds of (unsupervised) generative tasks.
22
+
23
+ To see the input structure of a given metric, you can look at its metric card. For example, in the case of [precision](https://huggingface.co/metrics/precision), the format is:
24
+ ```
25
+ >>> precision_metric = evaluate.load("precision")
26
+ >>> results = precision_metric.compute(references=[0, 1], predictions=[0, 1])
27
+ >>> print(results)
28
+ {'precision': 1.0}
29
+ ```
30
+
31
+ ### Task-specific metrics
32
+
33
+ Popular ML tasks like Machine Translation and Named Entity Recognition have specific metrics that can be used to compare models. For example, a series of different metrics have been proposed for text generation, ranging from [BLEU](https://huggingface.co/metrics/bleu) and its derivatives such as [GoogleBLEU](https://huggingface.co/metrics/google_bleu) and [GLEU](https://huggingface.co/metrics/gleu), but also [ROUGE](https://huggingface.co/metrics/rouge), [MAUVE](https://huggingface.co/metrics/mauve), etc.
34
+
35
+ You can find the right metric for your task by:
36
+
37
+ - **Looking at the [Task pages](https://huggingface.co/tasks)** to see what metrics can be used for evaluating models for a given task.
38
+ - **Checking out leaderboards** on sites like [Papers With Code](https://paperswithcode.com/) (you can search by task and by dataset).
39
+ - **Reading the metric cards** for the relevant metrics and seeing which ones are a good fit for your use case. For example, see the [BLEU metric card](https://github.com/huggingface/evaluate/tree/main/metrics/bleu) or [SQuAD metric card](https://github.com/huggingface/evaluate/tree/main/metrics/squad).
40
+ - **Looking at papers and blog posts** published on the topic and see what metrics they report. This can change over time, so try to pick papers from the last couple of years!
41
+
42
+ ### Dataset-specific metrics
43
+
44
+ Some datasets have specific metrics associated with them -- this is especially the case for popular benchmarks like [GLUE](https://huggingface.co/metrics/glue) and [SQuAD](https://huggingface.co/metrics/squad).
45
+
46
+ <Tip warning={true}>
47
+ 💡
48
+ GLUE is actually a collection of different subsets on different tasks, so first you need to choose the one that corresponds to the NLI task, such as mnli, which is described as “crowdsourced collection of sentence pairs with textual entailment annotations”
49
+ </Tip>
50
+
51
+
52
+ If you are evaluating your model on a benchmark dataset like the ones mentioned above, you can use its dedicated evaluation metric. Make sure you respect the format that they require. For example, to evaluate your model on the [SQuAD](https://huggingface.co/datasets/squad) dataset, you need to feed the `question` and `context` into your model and return the `prediction_text`, which should be compared with the `references` (based on matching the `id` of the question) :
53
+
54
+ ```
55
+ >>> from evaluate import load
56
+ >>> squad_metric = load("squad")
57
+ >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
58
+ >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
59
+ >>> results = squad_metric.compute(predictions=predictions, references=references)
60
+ >>> results
61
+ {'exact_match': 100.0, 'f1': 100.0}
62
+ ```
63
+
64
+ You can find examples of dataset structures by consulting the "Dataset Preview" function or the dataset card for a given dataset, and you can see how to use its dedicated evaluation function based on the metric card.
evaluate/docs/source/considerations.mdx ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Considerations for model evaluation
2
+
3
+ Developing an ML model is rarely a one-shot deal: it often involves multiple stages of defining the model architecture and tuning hyper-parameters before converging on a final set. Responsible model evaluation is a key part of this process, and 🤗 Evaluate is here to help!
4
+
5
+ Here are some things to keep in mind when evaluating your model using the 🤗 Evaluate library:
6
+
7
+ ## Properly splitting your data
8
+
9
+ Good evaluation generally requires three splits of your dataset:
10
+
11
+ - **train**: this is used for training your model.
12
+ - **validation**: this is used for validating the model hyperparameters.
13
+ - **test**: this is used for evaluating your model.
14
+
15
+ Many of the datasets on the 🤗 Hub are separated into 2 splits: `train` and `validation`; others are split into 3 splits (`train`, `validation` and `test`) -- make sure to use the right split for the right purpose!
16
+
17
+ Some datasets on the 🤗 Hub are already separated into these three splits. However, there are also many that only have a train/validation or only train split.
18
+
19
+ If the dataset you're using doesn't have a predefined train-test split, it is up to you to define which part of the dataset you want to use for training your model and which you want to use for hyperparameter tuning or final evaluation.
20
+
21
+ <Tip warning={true}>
22
+ Training and evaluating on the same split can misrepresent your results! If you overfit on your training data the evaluation results on that split will look great but the model will perform poorly on new data.
23
+ </Tip>
24
+
25
+ Depending on the size of the dataset, you can keep anywhere from 10-30% for evaluation and the rest for training, while aiming to set up the test set to reflect the production data as close as possible. Check out [this thread](https://discuss.huggingface.co/t/how-to-split-main-dataset-into-train-dev-test-as-datasetdict/1090) for a more in-depth discussion of dataset splitting!
26
+
27
+ ## The impact of class imbalance
28
+
29
+ While many academic datasets, such as the [IMDb dataset](https://huggingface.co/datasets/imdb) of movie reviews, are perfectly balanced, most real-world datasets are not. In machine learning a *balanced dataset* corresponds to a dataset where all labels are represented equally. In the case of the IMDb dataset this means that there are as many positive as negative reviews in the dataset. In an imbalanced dataset this is not the case: in fraud detection for example there are usually many more non-fraud cases than fraud cases in the dataset.
30
+
31
+ Having an imbalanced dataset can skew the results of your metrics. Imagine a dataset with 99 "non-fraud" cases and 1 "fraud" case. A simple model that always predicts "non-fraud" cases would yield a 99% accuracy which might sound good at first until you realize that you will never catch a fraud case.
32
+
33
+ Often, using more than one metric can help get a better idea of your model’s performance from different points of view. For instance, metrics like **[recall](https://huggingface.co/metrics/recall)** and **[precision](https://huggingface.co/metrics/precision)** can be used together, and the **[f1 score](https://huggingface.co/metrics/f1)** is actually the harmonic mean of the two.
34
+
35
+ In cases where a dataset is balanced, using [accuracy](https://huggingface.co/metrics/accuracy) can reflect the overall model performance:
36
+
37
+ ![Balanced Labels](https://huggingface.co/datasets/evaluate/media/resolve/main/balanced-classes.png)
38
+
39
+ In cases where there is an imbalance, using [F1 score](https://huggingface.co/metrics/f1) can be a better representation of performance, given that it encompasses both precision and recall.
40
+
41
+ ![Imbalanced Labels](https://huggingface.co/datasets/evaluate/media/resolve/main/imbalanced-classes.png)
42
+
43
+ Using accuracy in an imbalanced setting is less ideal, since it is not sensitive to minority classes and will not faithfully reflect model performance on them.
44
+
45
+ ## Offline vs. online model evaluation
46
+
47
+ There are multiple ways to evaluate models, and an important distinction is offline versus online evaluation:
48
+
49
+ **Offline evaluation** is done before deploying a model or using insights generated from a model, using static datasets and metrics.
50
+
51
+ **Online evaluation** means evaluating how a model is performing after deployment and during its use in production.
52
+
53
+ These two types of evaluation can use different metrics and measure different aspects of model performance. For example, offline evaluation can compare a model to other models based on their performance on common benchmarks, whereas online evaluation will evaluate aspects such as latency and accuracy of the model based on production data (for example, the number of user queries that it was able to address).
54
+
55
+ ## Trade-offs in model evaluation
56
+
57
+ When evaluating models in practice, there are often trade-offs that have to be made between different aspects of model performance: for instance, choosing a model that is slightly less accurate but that has a faster inference time, compared to a high-accuracy model that has a higher memory footprint and requires access to more GPUs.
58
+
59
+ Here are other aspects of model performance to consider during evaluation:
60
+
61
+ ### Interpretability
62
+
63
+ When evaluating models, **interpretability** (i.e. the ability to *interpret* results) can be very important, especially when deploying models in production.
64
+
65
+ For instance, metrics such as [exact match](https://huggingface.co/spaces/evaluate-metric/exact_match) have a set range (between 0 and 1, or 0% and 100%) and are easily understandable to users: for a pair of strings, the exact match score is 1 if the two strings are the exact same, and 0 otherwise.
66
+
67
+ Other metrics, such as [BLEU](https://huggingface.co/spaces/evaluate-metric/bleu), are harder to interpret: while they also range between 0 and 1, they can vary greatly depending on which parameters are used to generate the scores, especially when different tokenization and normalization techniques are used (see the [metric card](https://huggingface.co/spaces/evaluate-metric/bleu/blob/main/README.md) for more information about BLEU limitations). This means that it is difficult to interpret a BLEU score without having more information about the procedure used for obtaining it.
68
+
69
+ Interpretability can be more or less important depending on the evaluation use case, but it is a useful aspect of model evaluation to keep in mind, since communicating and comparing model evaluations is an important part of responsible machine learning.
70
+
71
+
72
+ ### Inference speed and memory footprint
73
+
74
+ While recent years have seen increasingly large ML models achieve high performance on a large variety of tasks and benchmarks, deploying these multi-billion parameter models in practice can be a challenge in itself, and many organizations lack the resources for this. This is why considering the **inference speed** and **memory footprint** of models is important, especially when doing online model evaluation.
75
+
76
+ Inference speed refers to the time that it takes for a model to make a prediction -- this will vary depending on the hardware used and the way in which models are queried, e.g. in real time via an API or in batch jobs that run once a day.
77
+
78
+ Memory footprint refers to the size of the model weights and how much hardware memory they occupy. If a model is too large to fit on a single GPU or CPU, then it has to be split over multiple ones, which can be more or less difficult depending on the model architecture and the deployment method.
79
+
80
+ When doing online model evaluation, there is often a trade-off to be done between inference speed and accuracy or precision, whereas this is less the case for offline evaluation.
81
+
82
+ ## Limitations and bias
83
+
84
+ All models and all metrics have their limitations and biases, which depend on the way in which they were trained, the data that was used, and their intended uses. It is important to measure and communicate these limitations clearly to prevent misuse and unintended impacts, for instance via [model cards](https://huggingface.co/course/chapter4/4?fw=pt) which document the training and evaluation process.
85
+
86
+ Measuring biases can be done by evaluating models on datasets such as [Wino Bias](https://huggingface.co/datasets/wino_bias) or [MD Gender Bias](https://huggingface.co/datasets/md_gender_bias), and by doing [Interactive Error Analysis](https://huggingface.co/spaces/nazneen/error-analysis) to try to identify which subsets of the evaluation dataset a model performs poorly on.
87
+
88
+ We are currently working on additional measurements that can be used to quantify different dimensions of bias in both models and datasets -- stay tuned for more documentation on this topic!
evaluate/docs/source/creating_and_sharing.mdx ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Creating and sharing a new evaluation
2
+
3
+ ## Setup
4
+
5
+ Before you can create a new metric make sure you have all the necessary dependencies installed:
6
+
7
+ ```bash
8
+ pip install evaluate[template]
9
+ ```
10
+
11
+ Also make sure your Hugging Face token is registered so you can connect to the Hugging Face Hub:
12
+
13
+ ```bash
14
+ huggingface-cli login
15
+ ```
16
+
17
+ ## Create
18
+
19
+ All evaluation modules, be it metrics, comparisons, or measurements, live on the 🤗 Hub in a [Space](https://huggingface.co/docs/hub/spaces) (see for example [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)). In principle, you could set up a new Space and add a new module following the same structure. However, we added a CLI that makes creating a new evaluation module much easier:
20
+
21
+ ```bash
22
+ evaluate-cli create "My Metric" --module_type "metric"
23
+ ```
24
+
25
+ This will create a new Space on the 🤗 Hub, clone it locally, and populate it with a template. Instructions on how to fill the template will be displayed in the terminal, but are also explained here in more detail.
26
+
27
+ For more information about Spaces, see the [Spaces documentation](https://huggingface.co/docs/hub/spaces).
28
+
29
+ ## Module script
30
+
31
+ The evaluation module script (the file with suffix `*.py`) is the core of the new module and includes all the code for computing the evaluation.
32
+
33
+ ### Attributes
34
+
35
+ Start by adding some information about your evaluation module in [`EvaluationModule._info`]. The most important attributes you should specify are:
36
+
37
+ 1. [`EvaluationModuleInfo.description`] provides a brief description about your evaluation module.
38
+
39
+ 2. [`EvaluationModuleInfo.citation`] contains a BibTex citation for the evaluation module.
40
+
41
+ 3. [`EvaluationModuleInfo.inputs_description`] describes the expected inputs and outputs. It may also provide an example usage of the evaluation module.
42
+
43
+ 4. [`EvaluationModuleInfo.features`] defines the name and type of the predictions and references. This has to be either a single `datasets.Features` object or a list of `datasets.Features` objects if multiple input types are allowed.
44
+
45
+ Then, we can move on to prepare everything before the actual computation.
46
+
47
+ ### Download
48
+
49
+ Some evaluation modules require some external data such as NLTK that requires resources or the BLEURT metric that requires checkpoints. You can implement these downloads in [`EvaluationModule._download_and_prepare`], which downloads and caches the resources via the `dl_manager`. A simplified example on how BLEURT downloads and loads a checkpoint:
50
+
51
+ ```py
52
+ def _download_and_prepare(self, dl_manager):
53
+ model_path = dl_manager.download_and_extract(CHECKPOINT_URLS[self.config_name])
54
+ self.scorer = score.BleurtScorer(os.path.join(model_path, self.config_name))
55
+ ```
56
+
57
+ Or if you need to download the NLTK `"punkt"` resources:
58
+
59
+ ```py
60
+ def _download_and_prepare(self, dl_manager):
61
+ import nltk
62
+ nltk.download("punkt")
63
+ ```
64
+
65
+ Next, we need to define how the computation of the evaluation module works.
66
+
67
+ ### Compute
68
+
69
+ The computation is performed in the [`EvaluationModule._compute`] method. It takes the same arguments as `EvaluationModuleInfo.features` and should then return the result as a dictionary. Here is an example of an exact match metric:
70
+
71
+
72
+ ```py
73
+ def _compute(self, references, predictions):
74
+ em = sum([r==p for r, p in zip(references, predictions)])/len(references)
75
+ return {"exact_match": em}
76
+ ```
77
+
78
+ This method is used when you call `.compute()` later on.
79
+
80
+ ## Readme
81
+
82
+ When you use the `evaluate-cli` to set up the evaluation module the Readme structure and instructions are automatically created. It should include a general description of the metric, information about its input/output format, examples as well as information about its limitations or biases and references.
83
+
84
+ ## Requirements
85
+
86
+ If your evaluation module has additional dependencies (e.g. `sklearn` or `nltk`) the `requirements.txt` file is the place to put them. The file follows the `pip` format and you can list all dependencies there.
87
+
88
+ ## App
89
+
90
+ The `app.py` is where the Spaces widget lives. In general it looks like the following and does not require any changes:
91
+
92
+ ```py
93
+ import evaluate
94
+ from evaluate.utils import launch_gradio_widget
95
+
96
+
97
+ module = evaluate.load("lvwerra/element_count")
98
+ launch_gradio_widget(module)
99
+ ```
100
+
101
+ If you want a custom widget you could add your gradio app here.
102
+
103
+ ## Push to Hub
104
+
105
+ Finally, when you are done with all the above changes it is time to push your evaluation module to the hub. To do so navigate to the folder of your module and git add/commit/push the changes to the hub:
106
+
107
+ ```
108
+ cd PATH_TO_MODULE
109
+ git add .
110
+ git commit -m "Add my new, shiny module."
111
+ git push
112
+ ```
113
+ Tada 🎉! Your evaluation module is now on the 🤗 Hub and ready to be used by everybody!
evaluate/docs/source/custom_evaluator.mdx ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Using the `evaluator` with custom pipelines
2
+
3
+ The evaluator is designed to work with `transformers` pipelines out-of-the-box. However, in many cases you might have a model or pipeline that's not part of the `transformers` ecosystem. You can still use `evaluator` to easily compute metrics for them. In this guide we show how to do this for a Scikit-Learn [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline) and a Spacy [pipeline](https://spacy.io). Let's start with the Scikit-Learn case.
4
+
5
+ ## Scikit-Learn
6
+
7
+ First we need to train a model. We'll train a simple text classifier on the [IMDb dataset](https://huggingface.co/datasets/imdb), so let's start by downloading the dataset:
8
+
9
+ ```py
10
+ from datasets import load_dataset
11
+
12
+ ds = load_dataset("imdb")
13
+ ```
14
+
15
+ Then we can build a simple TF-IDF preprocessor and Naive Bayes classifier wrapped in a `Pipeline`:
16
+
17
+ ```py
18
+ from sklearn.pipeline import Pipeline
19
+ from sklearn.naive_bayes import MultinomialNB
20
+ from sklearn.feature_extraction.text import TfidfTransformer
21
+ from sklearn.feature_extraction.text import CountVectorizer
22
+
23
+ text_clf = Pipeline([
24
+ ('vect', CountVectorizer()),
25
+ ('tfidf', TfidfTransformer()),
26
+ ('clf', MultinomialNB()),
27
+ ])
28
+
29
+ text_clf.fit(ds["train"]["text"], ds["train"]["label"])
30
+ ```
31
+
32
+ Following the convention in the `TextClassificationPipeline` of `transformers` our pipeline should be callable and return a list of dictionaries. In addition we use the `task` attribute to check if the pipeline is compatible with the `evaluator`. We can write a small wrapper class for that purpose:
33
+
34
+ ```py
35
+ class ScikitEvalPipeline:
36
+ def __init__(self, pipeline):
37
+ self.pipeline = pipeline
38
+ self.task = "text-classification"
39
+
40
+ def __call__(self, input_texts, **kwargs):
41
+ return [{"label": p} for p in self.pipeline.predict(input_texts)]
42
+
43
+ pipe = ScikitEvalPipeline(text_clf)
44
+ ```
45
+
46
+ We can now pass this `pipeline` to the `evaluator`:
47
+
48
+ ```py
49
+ from evaluate import evaluator
50
+
51
+ task_evaluator = evaluator("text-classification")
52
+ task_evaluator.compute(pipe, ds["test"], "accuracy")
53
+
54
+ >>> {'accuracy': 0.82956}
55
+ ```
56
+
57
+ Implementing that simple wrapper is all that's needed to use any model from any framework with the `evaluator`. In the `__call__` you can implement all logic necessary for efficient forward passes through your model.
58
+
59
+ ## Spacy
60
+
61
+ We'll use the `polarity` feature of the `spacytextblob` project to get a simple sentiment analyzer. First you'll need to install the project and download the resources:
62
+
63
+ ```bash
64
+ pip install spacytextblob
65
+ python -m textblob.download_corpora
66
+ python -m spacy download en_core_web_sm
67
+ ```
68
+
69
+ Then we can simply load the `nlp` pipeline and add the `spacytextblob` pipeline:
70
+ ```py
71
+ import spacy
72
+
73
+ nlp = spacy.load('en_core_web_sm')
74
+ nlp.add_pipe('spacytextblob')
75
+ ```
76
+
77
+ This snippet shows how we can use the `polarity` feature added with `spacytextblob` to get the sentiment of a text:
78
+
79
+ ```py
80
+ texts = ["This movie is horrible", "This movie is awesome"]
81
+ results = nlp.pipe(texts)
82
+
83
+ for txt, res in zip(texts, results):
84
+ print(f"{txt} | Polarity: {res._.blob.polarity}")
85
+ ```
86
+
87
+ Now we can wrap it in a simple wrapper class like in the Scikit-Learn example before. It just has to return a list of dictionaries with the predicted labels. If the polarity is greater than or equal to 0 we'll predict positive sentiment and negative otherwise:
88
+
89
+ ```py
90
+ class SpacyEvalPipeline:
91
+ def __init__(self, nlp):
92
+ self.nlp = nlp
93
+ self.task = "text-classification"
94
+
95
+ def __call__(self, input_texts, **kwargs):
96
+ results =[]
97
+ for p in self.nlp.pipe(input_texts):
98
+ if p._.blob.polarity>=0:
99
+ results.append({"label": 1})
100
+ else:
101
+ results.append({"label": 0})
102
+ return results
103
+
104
+ pipe = SpacyEvalPipeline(nlp)
105
+ ```
106
+
107
+ That class is compatible with the `evaluator` and we can use the same instance from the previous example along with the IMDb test set:
108
+
109
+ ```py
110
+ task_evaluator.compute(pipe, ds["test"], "accuracy")
111
+ >>> {'accuracy': 0.6914}
112
+ ```
113
+
114
+ This will take a little longer than the Scikit-Learn example but after roughly 10-15min you will have the evaluation results!
evaluate/docs/source/evaluation_suite.mdx ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Creating an EvaluationSuite
2
+
3
+ It can be useful to evaluate models on a variety of different tasks to understand their downstream performance. Assessing the model on several types of tasks can reveal gaps in performance along some axis. For example, when training a language model, it is often useful to measure perplexity on an in-domain corpus, but also to concurrently evaluate on tasks which test for general language capabilities like natural language entailment or question-answering, or tasks designed to probe the model along fairness and bias dimensions.
4
+
5
+ The `EvaluationSuite` provides a way to compose any number of ([evaluator](base_evaluator), dataset, metric) tuples as a SubTask to evaluate a model on a collection of several evaluation tasks. See the [evaluator documentation](base_evaluator) for a list of currently supported tasks.
6
+
7
+ A new `EvaluationSuite` is made up of a list of `SubTask` classes, each defining an evaluation task. The Python file containing the definition can be uploaded to a Space on the Hugging Face Hub so it can be shared with the community or saved/loaded locally as a Python script.
8
+
9
+ Some datasets require additional preprocessing before passing them to an `Evaluator`. You can set a `data_preprocessor` for each `SubTask` which is applied via a `map` operation using the `datasets` library. Keyword arguments for the `Evaluator` can be passed down through the `args_for_task` attribute.
10
+
11
+ To create a new `EvaluationSuite`, create a [new Space](https://huggingface.co/new-space) with a .py file which matches the name of the Space, add the below template to a Python file, and fill in the attributes for a new task.
12
+
13
+ The mandatory attributes for a new `SubTask` are `task_type` and `data`.
14
+ 1. [`task_type`] maps to the tasks currently supported by the Evaluator.
15
+ 2. [`data`] can be an instantiated Hugging Face dataset object or the name of a dataset.
16
+ 3. [`subset`] and [`split`] can be used to define which name and split of the dataset should be used for evaluation.
17
+ 4. [`args_for_task`] should be a dictionary with kwargs to be passed to the Evaluator.
18
+
19
+ ```python
20
+ import evaluate
21
+ from evaluate.evaluation_suite import SubTask
22
+
23
+ class Suite(evaluate.EvaluationSuite):
24
+
25
+ def __init__(self, name):
26
+ super().__init__(name)
27
+ self.preprocessor = lambda x: {"text": x["text"].lower()}
28
+ self.suite = [
29
+ SubTask(
30
+ task_type="text-classification",
31
+ data="glue",
32
+ subset="sst2",
33
+ split="validation[:10]",
34
+ args_for_task={
35
+ "metric": "accuracy",
36
+ "input_column": "sentence",
37
+ "label_column": "label",
38
+ "label_mapping": {
39
+ "LABEL_0": 0.0,
40
+ "LABEL_1": 1.0
41
+ }
42
+ }
43
+ ),
44
+ SubTask(
45
+ task_type="text-classification",
46
+ data="glue",
47
+ subset="rte",
48
+ split="validation[:10]",
49
+ args_for_task={
50
+ "metric": "accuracy",
51
+ "input_column": "sentence1",
52
+ "second_input_column": "sentence2",
53
+ "label_column": "label",
54
+ "label_mapping": {
55
+ "LABEL_0": 0,
56
+ "LABEL_1": 1
57
+ }
58
+ }
59
+ )
60
+ ]
61
+ ```
62
+
63
+ An `EvaluationSuite` can be loaded by name from the Hugging Face Hub, or locally by providing a path, and run with the `run(model_or_pipeline)` method. The evaluation results are returned along with their task names and information about the time it took to obtain predictions through the pipeline. These can be easily displayed with a `pandas.DataFrame`:
64
+
65
+ ```
66
+ >>> from evaluate import EvaluationSuite
67
+ >>> suite = EvaluationSuite.load('mathemakitten/glue-evaluation-suite')
68
+ >>> results = suite.run("gpt2")
69
+ ```
70
+
71
+ | accuracy | total_time_in_seconds | samples_per_second | latency_in_seconds | task_name |
72
+ |-----------:|------------------------:|---------------------:|---------------------:|:------------|
73
+ | 0.5 | 0.740811 | 13.4987 | 0.0740811 | glue/sst2 |
74
+ | 0.4 | 1.67552 | 5.9683 | 0.167552 | glue/rte |
evaluate/docs/source/index.mdx ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <br>
3
+ <img src="https://huggingface.co/datasets/evaluate/media/resolve/main/evaluate-banner.png" width="400"/>
4
+ <br>
5
+ </p>
6
+
7
+ # 🤗 Evaluate
8
+
9
+ A library for easily evaluating machine learning models and datasets.
10
+
11
+ With a single line of code, you get access to dozens of evaluation methods for different domains (NLP, Computer Vision, Reinforcement Learning, and more!). Be it on your local machine or in a distributed training setup, you can evaluate your models in a consistent and reproducible way!
12
+
13
+ Visit the 🤗 Evaluate [organization](https://huggingface.co/evaluate-metric) for a full list of available metrics. Each metric has a dedicated Space with an interactive demo for how to use the metric, and a documentation card detailing the metric's limitations and usage.
14
+
15
+ <div class="mt-10">
16
+ <div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
17
+ <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./installation"
18
+ ><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Tutorials</div>
19
+ <p class="text-gray-700">Learn the basics and become familiar with loading, computing, and saving with 🤗 Evaluate. Start here if you are using 🤗 Evaluate for the first time!</p>
20
+ </a>
21
+ <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./choosing_a_metric"
22
+ ><div class="w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">How-to guides</div>
23
+ <p class="text-gray-700">Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use 🤗 Evaluate to solve real-world problems.</p>
24
+ </a>
25
+ <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./types_of_evaluations"
26
+ ><div class="w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Conceptual guides</div>
27
+ <p class="text-gray-700">High-level explanations for building a better understanding of important topics such as considerations going into evaluating a model or dataset and the difference between metrics, measurements, and comparisons.</p>
28
+ </a>
29
+ <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./package_reference/main_classes"
30
+ ><div class="w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Reference</div>
31
+ <p class="text-gray-700">Technical descriptions of how 🤗 Evaluate classes and methods work.</p>
32
+ </a>
33
+ </div>
34
+ </div>
evaluate/docs/source/installation.mdx ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Installation
2
+
3
+ Before you start, you will need to set up your environment and install the appropriate packages. 🤗 Evaluate is tested on **Python 3.7+**.
4
+
5
+ ## Virtual environment
6
+
7
+ You should install 🤗 Evaluate in a [virtual environment](https://docs.python.org/3/library/venv.html) to keep everything neat and tidy.
8
+
9
+ 1. Create and navigate to your project directory:
10
+
11
+ ```bash
12
+ mkdir ~/my-project
13
+ cd ~/my-project
14
+ ```
15
+
16
+ 2. Start a virtual environment inside the directory:
17
+
18
+ ```bash
19
+ python -m venv .env
20
+ ```
21
+
22
+ 3. Activate and deactivate the virtual environment with the following commands:
23
+
24
+ ```bash
25
+ # Activate the virtual environment
26
+ source .env/bin/activate
27
+
28
+ # Deactivate the virtual environment
29
+ deactivate
30
+ ```
31
+
32
+ Once you have created your virtual environment, you can install 🤗 Evaluate in it.
33
+
34
+ ## pip
35
+
36
+ The most straightforward way to install 🤗 Evaluate is with pip:
37
+
38
+ ```bash
39
+ pip install evaluate
40
+ ```
41
+
42
+ Run the following command to check if 🤗 Evaluate has been properly installed:
43
+
44
+ ```bash
45
+ python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))"
46
+ ```
47
+
48
+ This should return:
49
+
50
+ ```bash
51
+ {'exact_match': 1.0}
52
+ ```
53
+
54
+ ## source
55
+
56
+ Building 🤗 Evaluate from source lets you make changes to the code base. To install from source, clone the repository and install with the following commands:
57
+
58
+ ```bash
59
+ git clone https://github.com/huggingface/evaluate.git
60
+ cd evaluate
61
+ pip install -e .
62
+ ```
63
+
64
+ Again, you can check if 🤗 Evaluate has been properly installed with:
65
+
66
+ ```bash
67
+ python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))"
68
+ ```
evaluate/docs/source/keras_integrations.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Working with Keras and Tensorflow
2
+
3
+
4
+
5
+ Evaluate can be easily integrated into your Keras and Tensorflow workflow. We'll demonstrate two ways of incorporating Evaluate into model training, using the Fashion MNIST example dataset. We'll train a standard classifier to predict two classes from this dataset, and show how to use a metric as a callback during training or afterwards for evaluation.
6
+
7
+
8
+ ```python
9
+ import numpy as np
10
+ from tensorflow import keras
11
+ from tensorflow.keras import layers
12
+ import evaluate
13
+
14
+ # We pull example code from Keras.io's guide on classifying with MNIST
15
+ # Located here: https://keras.io/examples/vision/mnist_convnet/
16
+
17
+ # Model / data parameters
18
+ input_shape = (28, 28, 1)
19
+
20
+ # Load the data and split it between train and test sets
21
+ (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
22
+
23
+
24
+ # Only select tshirts/tops and trousers, classes 0 and 1
25
+ def get_tshirts_tops_and_trouser(x_vals, y_vals):
26
+ mask = np.where((y_vals == 0) | (y_vals == 1))
27
+ return x_vals[mask], y_vals[mask]
28
+
29
+ x_train, y_train = get_tshirts_tops_and_trouser(x_train, y_train)
30
+ x_test, y_test = get_tshirts_tops_and_trouser(x_test, y_test)
31
+
32
+
33
+ # Scale images to the [0, 1] range
34
+ x_train = x_train.astype("float32") / 255
35
+ x_test = x_test.astype("float32") / 255
36
+
37
+ x_train = np.expand_dims(x_train, -1)
38
+ x_test = np.expand_dims(x_test, -1)
39
+
40
+
41
+ model = keras.Sequential(
42
+ [
43
+ keras.Input(shape=input_shape),
44
+ layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
45
+ layers.MaxPooling2D(pool_size=(2, 2)),
46
+ layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
47
+ layers.MaxPooling2D(pool_size=(2, 2)),
48
+ layers.Flatten(),
49
+ layers.Dropout(0.5),
50
+ layers.Dense(1, activation="sigmoid"),
51
+ ]
52
+ )
53
+ ```
54
+
55
+ ## Callbacks
56
+
57
+ Suppose we want to keep track of model metrics while a model is training. We can use a Callback in order to calculate this metric during training, after an epoch ends.
58
+
59
+ We'll define a callback here that will take a metric name and our training data, and have it calculate a metric after the epoch ends.
60
+
61
+
62
+ ```python
63
+ class MetricsCallback(keras.callbacks.Callback):
64
+
65
+ def __init__(self, metric_name, x_data, y_data) -> None:
66
+ super(MetricsCallback, self).__init__()
67
+
68
+ self.x_data = x_data
69
+ self.y_data = y_data
70
+ self.metric_name = metric_name
71
+ self.metric = evaluate.load(metric_name)
72
+
73
+ def on_epoch_end(self, epoch, logs=dict()):
74
+ m = self.model
75
+ # Ensure we get labels of "1" or "0"
76
+ training_preds = np.round(m.predict(self.x_data))
77
+ training_labels = self.y_data
78
+
79
+ # Compute score and save
80
+ score = self.metric.compute(predictions = training_preds, references = training_labels)
81
+
82
+ logs.update(score)
83
+ ```
84
+
85
+ We can pass this class to the `callbacks` keyword-argument to use it during training:
86
+
87
+
88
+ ```python
89
+ batch_size = 128
90
+ epochs = 2
91
+
92
+ model.compile(loss="binary_crossentropy", optimizer="adam")
93
+
94
+ model_history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1,
95
+ callbacks = [MetricsCallback(x_data = x_train, y_data = y_train, metric_name = "accuracy")])
96
+ ```
97
+
98
+ ## Using an Evaluate Metric for... Evaluation!
99
+
100
+ We can also use the same metric after model training! Here, we show how to check accuracy of the model after training on the test set:
101
+
102
+
103
+ ```python
104
+ acc = evaluate.load("accuracy")
105
+ # Round the predictions to turn them into "0" or "1" labels
106
+ test_preds = np.round(model.predict(x_test))
107
+ test_labels = y_test
108
+ ```
109
+
110
+ ```python
111
+ print("Test accuracy is : ", acc.compute(predictions = test_preds, references = test_labels))
112
+ # Test accuracy is : 0.9855
113
+ ```
evaluate/docs/source/package_reference/evaluator_classes.mdx ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluator
2
+
3
+ The evaluator classes for automatic evaluation.
4
+
5
+ ## Evaluator classes
6
+
7
+ The main entry point for using the evaluator:
8
+
9
+ [[autodoc]] evaluate.evaluator
10
+
11
+ The base class for all evaluator classes:
12
+
13
+ [[autodoc]] evaluate.Evaluator
14
+
15
+ ## The task specific evaluators
16
+
17
+ ### ImageClassificationEvaluator
18
+
19
+ [[autodoc]] evaluate.ImageClassificationEvaluator
20
+
21
+ ### QuestionAnsweringEvaluator
22
+
23
+ [[autodoc]] evaluate.QuestionAnsweringEvaluator
24
+ - compute
25
+
26
+ ### TextClassificationEvaluator
27
+
28
+ [[autodoc]] evaluate.TextClassificationEvaluator
29
+
30
+ ### TokenClassificationEvaluator
31
+
32
+ [[autodoc]] evaluate.TokenClassificationEvaluator
33
+ - compute
34
+
35
+ ### TextGenerationEvaluator
36
+
37
+ [[autodoc]] evaluate.TextGenerationEvaluator
38
+ - compute
39
+
40
+ ### Text2TextGenerationEvaluator
41
+
42
+ [[autodoc]] evaluate.Text2TextGenerationEvaluator
43
+ - compute
44
+
45
+ ### SummarizationEvaluator
46
+
47
+ [[autodoc]] evaluate.SummarizationEvaluator
48
+ - compute
49
+
50
+ ### TranslationEvaluator
51
+
52
+ [[autodoc]] evaluate.TranslationEvaluator
53
+ - compute
54
+
55
+ ### AutomaticSpeechRecognitionEvaluator
56
+
57
+ [[autodoc]] evaluate.AutomaticSpeechRecognitionEvaluator
58
+ - compute
59
+
60
+ ### AudioClassificationEvaluator
61
+
62
+ [[autodoc]] evaluate.AudioClassificationEvaluator
63
+ - compute