hanhainebula committed on
Commit
bc302de
·
verified ·
1 Parent(s): ac0f842

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ onnx/model.onnx_data filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 1024,
3
+ "pooling_mode_cls_token": true,
4
+ "pooling_mode_mean_tokens": false,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false
7
+ }
README.md ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: sentence-similarity
3
+ tags:
4
+ - sentence-transformers
5
+ - feature-extraction
6
+ - sentence-similarity
7
+ license: mit
8
+ ---
9
+
10
+ # BAAI-Multilingual-Base
11
+
12
+ **BAAI-Multilingual-Base** is a text embedding model distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.
13
+
14
+ - Multi-Functionality: It can simultaneously perform the three common retrieval functionalities of an embedding model: dense retrieval, multi-vector retrieval, and sparse retrieval.
15
+ - Multi-Linguality: It can support more than 100 working languages.
16
+ - Multi-Granularity: It is able to process inputs of different granularities, spanning from short sentences to long documents of up to 8192 tokens.
17
+
18
+
19
+ ## Usage
20
+
21
+ Install:
22
+ ```
23
+ pip install -U FlagEmbedding
24
+ ```
25
+
26
+ ### Generate Embedding for text
27
+
28
+ - Dense Embedding
29
+ ```python
30
+ from FlagEmbedding import BGEM3FlagModel
31
+
32
+ model = BGEM3FlagModel('hanhainebula/baai-multilingual-base',
33
+ use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
34
+
35
+ sentences_1 = ["What is BAAI-Multilingual-Base?", "Defination of BM25"]
36
+ sentences_2 = ["BAAI-Multilingual-Base is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
37
+ "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
38
+
39
+ embeddings_1 = model.encode(sentences_1,
40
+ batch_size=12,
41
+ max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
42
+ )['dense_vecs']
43
+ embeddings_2 = model.encode(sentences_2)['dense_vecs']
44
+ similarity = embeddings_1 @ embeddings_2.T
45
+ print(similarity)
46
+ # [[0.7026 0.439 ]
47
+ # [0.361 0.678 ]]
48
+ ```
49
+ You can also use sentence-transformers and Hugging Face transformers to generate dense embeddings.
50
+
51
+
52
+ - Sparse Embedding (Lexical Weight)
53
+ ```python
54
+ from FlagEmbedding import BGEM3FlagModel
55
+
56
+ model = BGEM3FlagModel('hanhainebula/baai-multilingual-base',
57
+ use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
58
+
59
+ sentences_1 = ["What is BAAI-Multilingual-Base?", "Defination of BM25"]
60
+ sentences_2 = ["BAAI-Multilingual-Base is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
61
+ "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
62
+
63
+ output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=False)
64
+ output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=False)
65
+
66
+ # you can see the weight for each token:
67
+ print(model.convert_id_to_token(output_1['lexical_weights']))
68
+ # [{'What': 0.10126, 'is': 0.1063, 'BA': 0.1858, 'AI': 0.2576, '-': 0.05154, 'Mul': 0.1381, 'ti': 0.1404, 'lingu': 0.2734, 'al': 0.10095,
69
+ # 'Bas': 0.2299, 'e': 0.153, '?': 0.05536}, {'De': 0.05002, 'fin': 0.1368, 'ation': 0.04495, 'of': 0.0633, 'BM': 0.2517, '25': 0.3333}]
70
+
71
+
72
+ # compute the scores via lexical matching
73
+ lexical_scores = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])
74
+ print(lexical_scores)
75
+ # 0.3666038513183594
76
+
77
+ print(model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_1['lexical_weights'][1]))
78
+ # 0.0
79
+ ```
80
+
81
+ - Multi-Vector (ColBERT)
82
+ ```python
83
+ from FlagEmbedding import BGEM3FlagModel
84
+
85
+ model = BGEM3FlagModel('hanhainebula/baai-multilingual-base',
86
+ use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
87
+
88
+ sentences_1 = ["What is BAAI-Multilingual-Base?", "Defination of BM25"]
89
+ sentences_2 = ["BAAI-Multilingual-Base is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
90
+ "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
91
+
92
+ output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)
93
+ output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)
94
+
95
+ print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]))
96
+ print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1]))
97
+ # 0.7982
98
+ # 0.4389
99
+ ```
100
+
101
+
102
+ ### Compute score for text pairs
103
+ Given a list of text pairs, you can get the scores computed by different methods.
104
+ ```python
105
+ from FlagEmbedding import BGEM3FlagModel
106
+
107
+ model = BGEM3FlagModel('hanhainebula/baai-multilingual-base',
108
+ use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
109
+
110
+ sentences_1 = ["What is BAAI-Multilingual-Base?", "Defination of BM25"]
111
+ sentences_2 = ["BAAI-Multilingual-Base is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
112
+ "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
113
+
114
+ sentence_pairs = [[i,j] for i in sentences_1 for j in sentences_2]
115
+
116
+ print(model.compute_score(sentence_pairs,
117
+ max_passage_length=128, # a smaller max length leads to a lower latency
118
+ weights_for_different_modes=[0.4, 0.2, 0.4])) # weights_for_different_modes(w) is used to do weighted sum: w[0]*dense_score + w[1]*sparse_score + w[2]*colbert_score
119
+
120
+ # {
121
+ # 'colbert': [0.7982305884361267, 0.438856840133667, 0.4464578628540039, 0.7897794842720032],
122
+ # 'sparse': [0.366455078125, 0.01297760009765625, 0.0, 0.1802978515625],
123
+ # 'dense': [0.70263671875, 0.43896484375, 0.361083984375, 0.67822265625],
124
+ # 'sparse+dense': [0.5905762314796448, 0.29696908593177795, 0.2407226711511612, 0.5122477412223816],
125
+ # 'colbert+sparse+dense': [0.6736379861831665, 0.3537241816520691, 0.3230167627334595, 0.6232604384422302]
126
+ # }
127
+ ```
colbert_linear.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19bfbae397c2b7524158c919d0e9b19393c5639d098f0a66932c91ed8f5f9abb
3
+ size 2100674
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "",
3
+ "architectures": [
4
+ "XLMRobertaModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 8194,
17
+ "model_type": "xlm-roberta",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "output_past": true,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.33.0",
25
+ "type_vocab_size": 1,
26
+ "use_cache": true,
27
+ "vocab_size": 250002
28
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.2.2",
4
+ "transformers": "4.33.0",
5
+ "pytorch": "2.1.2+cu121"
6
+ }
7
+ }
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5e0ce3470abf5ef3831aa1bd5553b486803e83251590ab7ff35a117cf6aad38
3
+ size 2271145830
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 8192,
3
+ "do_lower_case": false
4
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
sparse_linear.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45c93804d2142b8f6d7ec6914ae23a1eee9c6a1d27d83d908a20d2afb3595ad9
3
+ size 3516
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21106b6d7dab2952c1d496fb21d5dc9db75c28ed361a05f5020bbba27810dd08
3
+ size 17098108
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": {
7
+ "__type": "AddedToken",
8
+ "content": "<mask>",
9
+ "lstrip": true,
10
+ "normalized": true,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "model_max_length": 8192,
15
+ "pad_token": "<pad>",
16
+ "sep_token": "</s>",
17
+ "sp_model_kwargs": {},
18
+ "tokenizer_class": "XLMRobertaTokenizer",
19
+ "unk_token": "<unk>"
20
+ }