Luke Merrick committed
Commit 286cfcf · 1 Parent(s): ca5f86e
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "word_embedding_dimension": 768,
+ "pooling_mode_cls_token": true,
+ "pooling_mode_mean_tokens": false,
+ "pooling_mode_max_tokens": false,
+ "pooling_mode_mean_sqrt_len_tokens": false,
+ "pooling_mode_weightedmean_tokens": false,
+ "pooling_mode_lasttoken": false,
+ "include_prompt": true
+ }
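For reference, this configuration enables CLS-token pooling only. A minimal sketch of what that pooling step does, assuming a plain Hugging Face `AutoModel` forward pass (model name as used in the README below):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Minimal sketch of CLS-token pooling as configured above (pooling_mode_cls_token: true).
tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-e5-base")
model = AutoModel.from_pretrained("Snowflake/snowflake-arctic-e5-base")

tokens = tokenizer(["passage: The Data Cloud!"], return_tensors="pt")
with torch.inference_mode():
    last_hidden_state = model(**tokens).last_hidden_state  # shape: (batch, seq_len, 768)
embeddings = last_hidden_state[:, 0]  # take the [CLS] token rather than averaging over tokens
```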
README.md CHANGED
@@ -1,3 +1,109 @@
- ---
- license: apache-2.0
- ---
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - arctic
+ license: cc-by-nc-4.0
+ ---
+
+ # E5 Base, Arctic Edition
+
+ This model is the result of the Arctic Embed [walkthrough example](https://github.com/snowflakedb/ArcticTraining/blob/main/projects/arctic_embed/examples/finetune_models/README.md) for training embedding models using the [open-source Arctic Embed codebase](https://github.com/snowflakedb/ArcticTraining/blob/main/projects/arctic_embed/). In the walkthrough, we fine-tune the [`e5-base-unsupervised`](https://huggingface.co/intfloat/e5-base-unsupervised) model using an improved dataset that leverages modern hard-negative mining practices and includes three additional high-quality retrieval datasets beyond those used in the original E5 fine-tuning pipeline.
+
+ | Model | BEIR Score (nDCG@10) | CLEF English (nDCG@10) |
+ |:--------------------|-----------------------:|-------------------------:|
+ | e5-base-v2 | 50.19 | 45.38 |
+ | arctic-e5-base | 54.70 | 52.77 |
+ | gte-base-en-v1.5 | 54.02 | 47.91 |
+ | arctic-embed-m-v1.0 | 54.89 | 47.62 |
+ | arctic-embed-m-v2.0 | 55.38 | 54.06 |
+
+ **NOTE: This model was trained as an example and heavily leverages in-domain datasets from the data sources used by the BEIR benchmark. Though it performs well on the CLEF English dataset, it may be substantially overfit to the domains of the BEIR benchmark and may not generalize well to certain applications.**
+
+ ## Usage
+
+
+ ### Using Sentence Transformers
+
+ You can use the sentence-transformers package to run this model, as shown below.
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer("Snowflake/snowflake-arctic-e5-base")
+
+ queries = ['what is snowflake?', 'Where can I get the best tacos?']
+ documents = ['The Data Cloud!', 'Mexico City of Course!']
+
+ query_embeddings = model.encode(queries, prompt_name="query")
+ document_embeddings = model.encode(documents)
+
+ scores = query_embeddings @ document_embeddings.T
+ for query, query_scores in zip(queries, scores):
+     doc_score_pairs = list(zip(documents, query_scores))
+     doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+     # Output passages & scores
+     print("Query:", query)
+     for document, score in doc_score_pairs:
+         print(score, document)
+ ```
+ Produces:
+ ```
+ Query: what is snowflake?
+ 0.2747492 The Data Cloud!
+ 0.19998045 Mexico City of Course!
+ Query: Where can I get the best tacos?
+ 0.29974818 Mexico City of Course!
+ 0.2344071 The Data Cloud!
+ ```
+
+ ### Using Hugging Face Transformers
+
+ You can use the transformers package to run the model, as shown below. For optimal retrieval quality, use the CLS token to embed each text portion (not mean pooling) and use the standard E5 query and document prefixes shown below.
+
+ ```python
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-e5-base')
+ model = AutoModel.from_pretrained('Snowflake/snowflake-arctic-e5-base')
+ model.eval()
+
+ query_prefix = 'query: '
+ queries = ['what is snowflake?', 'Where can I get the best tacos?']
+ queries_with_prefix = ["{}{}".format(query_prefix, q) for q in queries]
+ query_tokens = tokenizer(queries_with_prefix, padding=True, truncation=True, return_tensors='pt', max_length=512)
+
+ document_prefix = 'passage: '
+ documents = ['The Data Cloud!', 'Mexico City of Course!']
+ documents_with_prefix = ["{}{}".format(document_prefix, d) for d in documents]
+ document_tokens = tokenizer(documents_with_prefix, padding=True, truncation=True, return_tensors='pt', max_length=512)
+
+ # Compute token embeddings and keep only the CLS token embedding for each text
+ with torch.inference_mode():
+     query_embeddings = model(**query_tokens)[0][:, 0]
+     document_embeddings = model(**document_tokens)[0][:, 0]
+
+
+ # Normalize embeddings
+ query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)
+ document_embeddings = torch.nn.functional.normalize(document_embeddings, p=2, dim=1)
+
+ scores = torch.mm(query_embeddings, document_embeddings.transpose(0, 1))
+ for query, query_scores in zip(queries, scores):
+     doc_score_pairs = list(zip(documents, query_scores))
+     doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+     # Output passages & scores
+     print("Query:", query)
+     for document, score in doc_score_pairs:
+         print(score, document)
+ ```
+
+ ## License
+
+
+ Arctic is licensed under the [Apache-2.0 license](https://www.apache.org/licenses/LICENSE-2.0). The released models can be used for commercial purposes free of charge.
+
+ <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=6ad53892-f1e7-4d3a-a135-60ca6264a7aa" />
biencoder_config.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "pooling": "first_token"
+ }
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "_name_or_path": "intfloat/e5-base-unsupervised",
+ "architectures": [
+ "BertModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.47.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 30522
+ }
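Since `torch_dtype` above is `bfloat16`, the checkpoint weights are stored in half precision. A hedged sketch of loading the encoder in that stored precision (loading in float32, the usual transformers default, works just as well):

```python
import torch
from transformers import AutoModel

# Sketch: load the BERT encoder in the checkpoint's stored bfloat16 precision.
model = AutoModel.from_pretrained(
    "Snowflake/snowflake-arctic-e5-base", torch_dtype=torch.bfloat16
)
```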
config_sentence_transformers.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "__version__": {
+ "sentence_transformers": "2.7.0.dev0",
+ "transformers": "4.39.3",
+ "pytorch": "2.1.0+cu121"
+ },
+ "prompts": {
+ "query": "query: ",
+ "document": "passage: "
+ },
+ "default_prompt_name": "document"
+ }
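The `prompts` and `default_prompt_name` entries above are what let the README's sentence-transformers example work without manual prefixes. A minimal sketch of how they are applied (behavior as documented for recent sentence-transformers releases):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Snowflake/snowflake-arctic-e5-base")

# prompt_name="query" selects the "query: " prefix from the prompts map above.
query_embeddings = model.encode(["what is snowflake?"], prompt_name="query")

# With no prompt_name, default_prompt_name="document" applies the "passage: " prefix.
document_embeddings = model.encode(["The Data Cloud!"])
```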
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:172d385d66a8e09d15b4185f4224c55a4118837ae84103f32e0072a1cabf45f6
+ size 218986928
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+ {
+ "idx": 0,
+ "name": "0",
+ "path": "",
+ "type": "sentence_transformers.models.Transformer"
+ },
+ {
+ "idx": 1,
+ "name": "1",
+ "path": "1_Pooling",
+ "type": "sentence_transformers.models.Pooling"
+ },
+ {
+ "idx": 2,
+ "name": "2",
+ "path": "2_Normalize",
+ "type": "sentence_transformers.models.Normalize"
+ }
+ ]
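The three modules listed above form the pipeline Transformer → Pooling → Normalize. As a rough sketch, the same pipeline could be assembled by hand with the sentence-transformers module API (pooling mode and sequence length taken from the other configs in this commit):

```python
from sentence_transformers import SentenceTransformer, models

# Sketch: rebuild the module pipeline described in modules.json explicitly.
transformer = models.Transformer("Snowflake/snowflake-arctic-e5-base", max_seq_length=512)
pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="cls")
normalize = models.Normalize()

model = SentenceTransformer(modules=[transformer, pooling, normalize])
```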
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "max_seq_length": 512,
+ "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "[CLS]",
+ "do_lower_case": true,
+ "extra_special_tokens": {},
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff