Sabareeshr algomuffin committed on
Commit
e26015c
·
0 Parent(s):

Duplicate from algomuffin/neural-search-engine

Browse files

Co-authored-by: Alessandro Miragliotta <[email protected]>

Files changed (6) hide show
  1. .gitattributes +29 -0
  2. README.md +38 -0
  3. app.py +31 -0
  4. corpus.pkl +3 -0
  5. corpus_embeddings_cpu.pkl +3 -0
  6. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ corpus.pkl filter=lfs diff=lfs merge=lfs -text
29
+ corpus_embeddings_cpu.pkl filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Neural Search Engine
3
+ emoji: 📉
4
+ colorFrom: indigo
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ duplicated_from: algomuffin/neural-search-engine
10
+ ---
11
+
12
+ # Configuration
13
+
14
+ `title`: _string_
15
+ Display title for the Space
16
+
17
+ `emoji`: _string_
18
+ Space emoji (emoji-only character allowed)
19
+
20
+ `colorFrom`: _string_
21
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
22
+
23
+ `colorTo`: _string_
24
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
25
+
26
+ `sdk`: _string_
27
+ Can be either `gradio` or `streamlit`
28
+
29
+ `sdk_version` : _string_
30
+ Only applicable for `streamlit` SDK.
31
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
32
+
33
+ `app_file`: _string_
34
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
35
+ Path is relative to the root of the repository.
36
+
37
+ `pinned`: _boolean_
38
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- Dependencies ---
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import pickle
import pandas as pd
import gradio as gr

# Retrieval / reranking models:
# the bi-encoder embeds the query for fast candidate retrieval,
# the cross-encoder scores (query, passage) pairs for precise reranking.
bi_encoder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Pre-computed Wikipedia passages and their embeddings, shipped as pickles
# alongside the app. NOTE(review): pickle is only safe for trusted files —
# these are part of the repo, so that holds here.
corpus_embeddings = pd.read_pickle("corpus_embeddings_cpu.pkl")
corpus = pd.read_pickle("corpus.pkl")
10
def search(query, top_k=100):
    """Semantic search with retrieve-and-rerank over the pickled Wikipedia corpus.

    Encodes *query* with the bi-encoder, retrieves the ``top_k`` nearest
    passages from ``corpus_embeddings``, reranks them with the cross-encoder,
    and returns the five best passages as a 5-tuple of strings (padded with
    empty strings if fewer than five hits come back, so the Gradio interface
    always receives exactly five outputs).

    Args:
        query: Free-text search query.
        top_k: Number of candidates the bi-encoder retrieval stage keeps.

    Returns:
        A 5-tuple of passage strings, best match first.
    """
    print("Top 5 Answer by the NSE:")
    print()
    ans = []

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Score all retrieved passages with the cross_encoder on (query, passage) pairs
    cross_inp = [[query, corpus[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach cross-encoder scores, then sort best-first
    for idx, score in enumerate(cross_scores):
        hits[idx]['cross-score'] = score
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    for hit in hits[:5]:
        ans.append(corpus[hit['corpus_id']])

    # Guard against fewer than 5 hits (tiny corpus / small top_k): the caller
    # unconditionally unpacks 5 values, so pad rather than raise IndexError.
    while len(ans) < 5:
        ans.append("")
    return ans[0], ans[1], ans[2], ans[3], ans[4]
31
# Wire the search function into a Gradio UI: one text input, five text outputs
# (the top-5 reranked passages), with a few canned example queries.
iface = gr.Interface(
    fn=search,
    inputs=["text"],
    outputs=["textbox", "textbox", "textbox", "textbox", "textbox"],
    examples=[
        "How big is London?",
        "Where is Rome?",
        "Who is steve jobs?",
        "What is the most interesting thing about our universe?",
    ],
    article="This is a semantic search engine powered by SentenceTransformers (Nils_Reimers) with a retrieval and reranking system on Wikipedia corpus. It will show the top 5 results",
    title="Neural Search Engine",
).launch()
corpus.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4db5fa7be970a2a5ada955d439e2501ca2cf25f79c417092dc17f2ef4c912419
3
+ size 41731886
corpus_embeddings_cpu.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba44cff67259c32981994a8978f1aaa513cc48d2003d021856155328be2852b
3
+ size 260501391
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ sentence-transformers==2.1.0
2
+ torch==1.10.0
3
+ pandas==1.1.5