Tollef Jørgensen committed on
Commit
8ecdadf
·
1 Parent(s): f597c2d
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. .gitignore +1 -0
  3. app.py +93 -0
  4. faiss.index.zip +3 -0
  5. faiss.lookup.csv +0 -0
  6. prep.py +5 -0
.gitattributes CHANGED
@@ -1,3 +1,4 @@
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
1
+ *.index
2
  *.7z filter=lfs diff=lfs merge=lfs -text
3
  *.arrow filter=lfs diff=lfs merge=lfs -text
4
  *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.index
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import gradio as gr
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sentence_transformers import SentenceTransformer
6
+ import zipfile
7
+
8
# The FAISS index is shipped as a zip archive; unpack it into the current
# working directory so it can be read from disk further down.
index_file = "faiss.index.zip"

with zipfile.ZipFile(index_file, mode="r") as archive:
    archive.extractall()

# The embedding model is loaded with the OpenVINO backend from a
# pull-request revision of the model repository.
pr_number = 14
model = SentenceTransformer(
    "intfloat/multilingual-e5-small",
    revision=f"refs/pr/{pr_number}",
    backend="openvino",
)
19
+
20
class FaissIndex:
    """Semantic search over a FAISS index backed by a question/answer table.

    The row order of the CSV at ``data_path`` is used as the id space of the
    FAISS index: a hit with id ``i`` is resolved to ``self.df.iloc[i]``.
    The CSV must provide ``question`` and ``answer`` columns.
    """

    def __init__(
        self,
        model: SentenceTransformer,
        data_path: str = "faiss.lookup.csv",
        index_path: str = "faiss.index",
    ):
        """Load the lookup table and the FAISS index from disk.

        Args:
            model: Encoder used to embed text queries.
            data_path: CSV file with ``question`` and ``answer`` columns.
            index_path: Serialized FAISS index file.
        """
        self.model = model
        self.df = pd.read_csv(data_path)
        self.index = faiss.read_index(index_path)

    def search_embeddings(self, query, k=5):
        """Run a raw vector search against the FAISS index.

        This was previously a first ``search`` definition that the later
        text-based ``search`` silently overwrote; it now has its own name so
        that both entry points are reachable.

        Args:
            query: Query vectors, array-like of shape ``(n_queries, dim)``.
            k: Number of neighbours to request per query.

        Returns:
            The ``(distances, indices)`` pair returned by the FAISS index.
        """
        vectors = np.asarray(query, dtype="float32")
        return self.index.search(vectors, k)

    def extract_docs(self, indices, k=None):
        """Turn FAISS hit ids into formatted, de-duplicated answer strings.

        Args:
            indices: 2-D array of hit ids as returned by a FAISS search; only
                the first row (first query) is used.
            k: Unused; accepted only for backward compatibility with callers
                that pass it.

        Returns:
            A list of ``"<answer> (kilde: <question>)"`` strings in hit
            order, keeping only the first hit for each distinct answer.
        """
        # FAISS pads the result with -1 when fewer than k neighbours exist.
        # A negative position would make ``iloc`` pick rows from the end of
        # the table, so drop those placeholders before the lookup.
        row_ids = [int(i) for i in indices[0] if i >= 0]
        lookup = self.df.iloc[row_ids]

        # Different questions may share one answer; show each answer once.
        seen_answers = set()
        formatted_pairs = []
        for question, answer in zip(
            lookup["question"].tolist(), lookup["answer"].tolist()
        ):
            if answer in seen_answers:
                continue
            seen_answers.add(answer)
            formatted_pairs.append(f"{answer} (kilde: {question})")
        return formatted_pairs

    def search(self, query: str, k: int = 5):
        """Embed a text query and return the formatted best-matching answers.

        Args:
            query: Free-text search string.
            k: Number of neighbours to request from the index.

        Returns:
            Up to ``k`` formatted answer strings (see ``extract_docs``).
        """
        # E5 models expect search queries to carry a "query: " prefix.
        enc = self.model.encode(["query: " + query])
        emb = np.asarray(enc, dtype="float32").reshape(1, -1)
        _, indices = self.search_embeddings(emb, k)
        return self.extract_docs(indices, k)
63
+
64
+
65
+ index = FaissIndex(model)
66
+
67
+
68
def query_faiss_index(søketekst):
    """Search the FAISS index for the given text and format the hits.

    Args:
        søketekst (str): The search text to query the FAISS index with.

    Returns:
        str: Up to 5 search results separated by blank lines. Fewer than 5
        are returned when several hits share the same answer. An empty
        string is returned when the search text is empty or whitespace-only.
    """
    # The interface runs in live mode, so this is also called while the
    # text box is still empty; skip the search instead of embedding an
    # empty query and showing unrelated hits.
    if not søketekst or not søketekst.strip():
        return ""

    results = index.search(søketekst, k=5)
    return "\n\n".join(results)
79
+
80
+
81
# Gradio UI: a free-text search box as input and a text box for the results.
search_box = gr.Textbox(lines=2, placeholder="Søk etter info i SIKT", interactive=True)
results_box = gr.Textbox(label="Søkeresultater", type="text", lines=15)

iface = gr.Interface(
    fn=query_faiss_index,
    inputs=search_box,
    outputs=results_box,
    title="SIKT-FAQ",
    description="Semantisk søk i SIKT med Openvino.",
    live=True,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
faiss.index.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85d3ff0d5a335201c3b92fb3466df469c4f3fb9569a087166d393ca7801527e
3
+ size 6895269
faiss.lookup.csv ADDED
The diff for this file is too large to render. See raw diff
 
prep.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
import zipfile

index_file = "faiss.index"


def zip_index(index_file: str = index_file) -> str:
    """Pack the FAISS index file into ``<index_file>.zip``.

    ``zipfile`` stores members uncompressed unless a compression method is
    given, so DEFLATE is requested explicitly to actually shrink the index.

    Args:
        index_file: Path of the index file to archive.

    Returns:
        The path of the archive that was written.
    """
    archive_path = index_file + ".zip"
    with zipfile.ZipFile(
        archive_path, "w", compression=zipfile.ZIP_DEFLATED
    ) as archive:
        archive.write(index_file)
    return archive_path


if __name__ == "__main__":
    zip_index()