Add new CrossEncoder model

Browse files

Files changed (5) hide show

README.md +66 -66
config.json +46 -46
onnx/model.onnx +3 -0
special_tokens_map.json +37 -37
tokenizer_config.json +58 -57

README.md CHANGED Viewed

@@ -1,67 +1,67 @@
----
-license: apache-2.0
-pipeline_tag: text-ranking
-language:
-- en
-library_name: sentence-transformers
-base_model:
-- google/electra-base-discriminator
-tags:
-- transformers
----
-## Cross-Encoder for Text Ranking
-This model is a port of the [webis/monoelectra-base](https://huggingface.co/webis/monoelectra-base) model from [lightning-ir](https://github.com/webis-de/lightning-ir) to [Sentence Transformers](https://sbert.net/) and [Transformers](https://huggingface.co/docs/transformers).
-The original model was introduced in the paper [A Systematic Investigation of Distilling Large Language Models into Cross-Encoders for Passage Re-ranking](https://arxiv.org/abs/2405.07920). See https://github.com/webis-de/rank-distillm for code used to train the original model.
-The model can be used as a reranker in a 2-stage "retrieve-rerank" pipeline, where it reorders passages returned by a retriever model (e.g. an embedding model or BM25) given some query. See [SBERT.net Retrieve & Re-rank](https://www.sbert.net/examples/applications/retrieve_rerank/README.html) for more details.
-## Usage with Sentence Transformers
-The usage is easy when you have [SentenceTransformers](https://www.sbert.net/) installed.
-```bash
-pip install sentence-transformers
-```
-Then you can use the pre-trained model like this:
-```python
-from sentence_transformers import CrossEncoder
-model = CrossEncoder("cross-encoder/monoelectra-base", trust_remote_code=True)
-scores = model.predict([
-    ("How many people live in Berlin?", "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers."),
-    ("How many people live in Berlin?", "Berlin is well known for its museums."),
-])
-print(scores)
-# [ 8.122868 -4.292924]
-```
-## Usage with Transformers
-```python
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import torch
-model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/monoelectra-base", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained("cross-encoder/monoelectra-base")
-features = tokenizer(
-    [
-        ("How many people live in Berlin?", "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers."),
-        ("How many people live in Berlin?", "Berlin is well known for its museums."),
-    ],
-    padding=True,
-    truncation=True,
-    return_tensors="pt",
-)
-model.eval()
-with torch.no_grad():
-    scores = model(**features).logits.view(-1)
-print(scores)
-# tensor([ 8.1229, -4.2929])
 ```

+---
+license: apache-2.0
+pipeline_tag: text-ranking
+language:
+- en
+library_name: sentence-transformers
+base_model:
+- google/electra-base-discriminator
+tags:
+- transformers
+---
+## Cross-Encoder for Text Ranking
+This model is a port of the [webis/monoelectra-base](https://huggingface.co/webis/monoelectra-base) model from [lightning-ir](https://github.com/webis-de/lightning-ir) to [Sentence Transformers](https://sbert.net/) and [Transformers](https://huggingface.co/docs/transformers).
+The original model was introduced in the paper [A Systematic Investigation of Distilling Large Language Models into Cross-Encoders for Passage Re-ranking](https://arxiv.org/abs/2405.07920). See [webis-de/rank-distillm](https://github.com/webis-de/rank-distillm) for the code used to train the original model.
+The model can be used as a reranker in a 2-stage "retrieve-rerank" pipeline, where it reorders passages returned by a retriever model (e.g. an embedding model or BM25) given some query. See [SBERT.net Retrieve & Re-rank](https://www.sbert.net/examples/applications/retrieve_rerank/README.html) for more details.
+## Usage with Sentence Transformers
+Using the model is straightforward once [Sentence Transformers](https://www.sbert.net/) is installed:
+```bash
+pip install sentence-transformers
+```
+Then you can use the pre-trained model like this:
+```python
+from sentence_transformers import CrossEncoder
+model = CrossEncoder("cross-encoder/monoelectra-base", trust_remote_code=True)
+scores = model.predict([
+    ("How many people live in Berlin?", "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers."),
+    ("How many people live in Berlin?", "Berlin is well known for its museums."),
+])
+print(scores)
+# [ 8.122868 -4.292924]
+```
+## Usage with Transformers
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/monoelectra-base", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("cross-encoder/monoelectra-base")
+features = tokenizer(
+    [
+        ("How many people live in Berlin?", "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers."),
+        ("How many people live in Berlin?", "Berlin is well known for its museums."),
+    ],
+    padding=True,
+    truncation=True,
+    return_tensors="pt",
+)
+model.eval()
+with torch.no_grad():
+    scores = model(**features).logits.view(-1)
+print(scores)
+# tensor([ 8.1229, -4.2929])
 ```

config.json CHANGED Viewed

@@ -1,46 +1,46 @@
-{
-  "architectures": [
-    "WebisCrossEncoderForSequenceClassification"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "auto_map": {
-    "AutoModelForSequenceClassification": "modeling.WebisCrossEncoderForSequenceClassification"
-  },
-  "backbone_model_type": "electra",
-  "classifier_dropout": null,
-  "doc_length": 256,
-  "embedding_size": 768,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "id2label": {
-    "0": "LABEL_0"
-  },
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "label2id": {
-    "LABEL_0": 0
-  },
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "electra",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
-  "pooling_strategy": "first",
-  "position_embedding_type": "absolute",
-  "query_length": 32,
-  "sentence_transformers": {
-    "activation_fn": "torch.nn.modules.linear.Identity",
-    "version": "4.0.1"
-  },
-  "summary_activation": "gelu",
-  "summary_last_dropout": 0.1,
-  "summary_type": "first",
-  "summary_use_proj": true,
-  "torch_dtype": "float32",
-  "transformers_version": "4.49.0",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
-}

+{
+  "architectures": [
+    "WebisCrossEncoderForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoModelForSequenceClassification": "cross-encoder/monoelectra-base--modeling.WebisCrossEncoderForSequenceClassification"
+  },
+  "backbone_model_type": "electra",
+  "classifier_dropout": null,
+  "doc_length": 256,
+  "embedding_size": 768,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "electra",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooling_strategy": "first",
+  "position_embedding_type": "absolute",
+  "query_length": 32,
+  "sentence_transformers": {
+    "activation_fn": "torch.nn.modules.linear.Identity",
+    "version": "4.1.0.dev0"
+  },
+  "summary_activation": "gelu",
+  "summary_last_dropout": 0.1,
+  "summary_type": "first",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.0.dev0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

onnx/model.onnx ADDED Viewed

@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aef62e9a40e6a7e84a1b5e3c5e20c2aea3d8dc6a67bb9ed6581f0eb91823b6a0
+size 438212375

special_tokens_map.json CHANGED Viewed

@@ -1,37 +1,37 @@
-{
-  "cls_token": {
-    "content": "[CLS]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "mask_token": {
-    "content": "[MASK]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "[PAD]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "sep_token": {
-    "content": "[SEP]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "[UNK]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
-}

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json CHANGED Viewed

@@ -1,57 +1,58 @@
-{
-  "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "100": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "101": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "102": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "103": {
-      "content": "[MASK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_lower_case": true,
-  "doc_length": 256,
-  "mask_token": "[MASK]",
-  "model_max_length": 512,
-  "pad_token": "[PAD]",
-  "query_length": 32,
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "ElectraTokenizer",
-  "unk_token": "[UNK]"
-}

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "doc_length": 256,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "query_length": 32,
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "ElectraTokenizer",
+  "unk_token": "[UNK]"
+}