Tollef Jørgensen committed on
Commit
a05279b
1 Parent(s): 56d72d1

update with model+data

Browse files
Files changed (1) hide show
  1. app.py +72 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import gradio as gr
3
+ import numpy as np
4
+ import pandas as pd
5
+ from datasets import load_dataset
6
+ from sentence_transformers import SentenceTransformer, util
7
+
8
+
9
def build_doc_frame(df, idx=0):
    """Return a single document as a frame with one row per sentence.

    Args:
        df: DataFrame containing at least the columns ``url``, ``sentences``
            and ``embedding``. For each row, ``sentences`` and ``embedding``
            must be list-likes of the same length so they can be exploded
            together.
        idx: Zero-based positional index of the document to extract.

    Returns:
        A DataFrame with the columns ``url``, ``sentences`` and ``embedding``
        where every sentence/embedding pair of the chosen document occupies
        its own row (``url`` is repeated on each row).
    """
    # Honour the requested position; previously row 0 was always returned
    # regardless of ``idx``.
    doc = df.iloc[idx]
    # Turn the row (a Series) back into a one-row frame.
    doc_df = pd.DataFrame(doc).T
    # Keep only the source URL, the sentences and their embeddings.
    doc_df = doc_df[["url", "sentences", "embedding"]]
    # Unpack the parallel sentence/embedding lists into separate rows.
    return doc_df.explode(["sentences", "embedding"])
18
+
19
+
20
def get_doc_embeddings(doc):
    """Stack the vectors held in ``doc.embedding`` into one float32 array.

    Args:
        doc: Object exposing an ``embedding`` attribute with a ``tolist()``
            method (e.g. a DataFrame with an ``embedding`` column).

    Returns:
        A NumPy array of dtype ``float32`` built from the listed vectors.
    """
    vectors = doc.embedding.tolist()
    return np.array(vectors, dtype="float32")
22
+
23
+
24
def faiss_search(doc_idx, query_str, K=5):
    """Rank the sentences of one court case by similarity to a text query.

    The sentence embeddings of the selected document and the encoded query
    are L2-normalised and compared with an inner-product index, i.e. the
    scores are cosine similarities.

    Relies on the module-level ``df`` (documents) and ``model`` (sentence
    encoder).

    Args:
        doc_idx: The selected dropdown option, a pair ``(idx, text)`` whose
            ``idx`` is the 1-based position of the document in ``df``.
        query_str: Free-text query to search for.
        K: Maximum number of sentences to return. Non-integer values are
            truncated to an integer.

    Returns:
        A printable string: a header line, one ``score<TAB>sentence`` line
        per hit (best first), and a closing underline.
    """
    # doc_idx is a choice option of (idx, text); idx is 1-based.
    position = doc_idx[0] - 1
    newdoc = build_doc_frame(df, idx=position)
    embeddings = get_doc_embeddings(newdoc)

    # UI sliders may deliver floats, but FAISS requires an integer k.
    K = int(K)

    faiss.normalize_L2(embeddings)
    # Derive the dimensionality from the data instead of assuming 768.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # Encode the caller's query (previously overwritten by a debug constant).
    target_emb = np.ascontiguousarray(model.encode([query_str]), dtype="float32")
    target_emb = target_emb.reshape(1, -1)
    faiss.normalize_L2(target_emb)

    scores, hits = index.search(target_emb, K)

    # FAISS pads with label -1 when the document has fewer than K sentences;
    # skip those so ``iloc[-1]`` does not silently return the last sentence.
    pretty_results = [
        (round(float(score), 3), newdoc.iloc[hit].sentences)
        for hit, score in zip(hits[0], scores[0])
        if hit >= 0
    ]
    pretty_results_str = "\n".join(f"{score}\t{sent}" for score, sent in pretty_results)
    top_k_str = f"Top {K} results for: {query_str}"
    underlines = "__" * 40

    return f"{top_k_str}\n{pretty_results_str}\n{underlines}"
52
+
53
+
54
# Load the dataset and the sentence-embedding model once at start-up; both are
# read as module-level globals by faiss_search.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
model = SentenceTransformer("NbAiLab/nb-sbert-base")
df = dataset["train"].to_pandas()

# One option per document: (index label + 1, preview of the first summary item
# cut to 60 characters).
# NOTE(review): faiss_search reads element 0 of the selected option as the
# 1-based document number — confirm the installed Gradio version passes the
# whole pair through rather than treating the tuple as (label, value).
dropdown_opts = [
    (idx + 1, f"\t{summary[0][:60]}...")
    for idx, summary in zip(df.index, df["summary"])
]

iface = gr.Interface(
    fn=faiss_search,
    inputs=[
        gr.Dropdown(label="Select a court case", choices=dropdown_opts),
        gr.Textbox(lines=2, placeholder="Your query here..."),
        # step=1 keeps the value integral; it is used as the number of
        # neighbours to retrieve.
        gr.Slider(minimum=1, maximum=10, step=1, label="Number of matches", value=5),
    ],
    outputs="text",
    title="Lovdata rettsavgjørelser - semantisk søk",
    description="Velg en rettsak og søk for å hente ut lignende setninger i saken",
)

iface.launch()