Spaces:
Runtime error
Runtime error
Tollef Jørgensen
committed on
Commit
•
a05279b
1
Parent(s):
56d72d1
update with model+data
Browse files
app.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import faiss
|
2 |
+
import gradio as gr
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
from datasets import load_dataset
|
6 |
+
from sentence_transformers import SentenceTransformer, util
|
7 |
+
|
8 |
+
|
9 |
+
def build_doc_frame(df, idx=0):
    """Return one document of *df* as a frame with one row per sentence.

    Args:
        df: Frame holding one document per row; must provide the columns
            ``url``, ``sentences`` and ``embedding``. ``sentences`` and
            ``embedding`` must be list-likes of equal length within a row.
        idx: Zero-based position of the document to unpack.

    Returns:
        A frame with the columns ``url``, ``sentences`` and ``embedding``
        where every sentence (and its embedding) has its own row.

    Raises:
        IndexError: If *idx* is outside the bounds of *df*.
    """
    # Select the requested document by position (previously row 0 was
    # always used, which silently ignored ``idx``).
    doc = df.iloc[idx]
    # Turn the row (a Series) back into a one-row frame.
    doc_df = pd.DataFrame(doc).T
    # Keep only the url, the sentences and their embeddings.
    doc_df = doc_df[["url", "sentences", "embedding"]]
    # Unpack the parallel sentence/embedding lists into separate rows.
    return doc_df.explode(["sentences", "embedding"])
|
18 |
+
|
19 |
+
|
20 |
+
def get_doc_embeddings(doc):
    """Gather the ``embedding`` column of *doc* into a single float32 array."""
    vectors = doc.embedding.tolist()
    return np.array(vectors, dtype=np.float32)
|
22 |
+
|
23 |
+
|
24 |
+
def faiss_search(doc_idx, query_str, K=5):
    """Find the sentences of one court case most similar to a query.

    Relies on the module-level ``df`` (documents) and ``model`` (sentence
    encoder), and on ``build_doc_frame`` / ``get_doc_embeddings``.

    Args:
        doc_idx: The dropdown selection. Either an ``(idx, text)`` pair
            whose first item is the 1-based document number, or that
            number on its own.
            NOTE(review): what Gradio actually passes for tuple choices
            depends on the Gradio version -- confirm against the
            deployed version.
        query_str: Free-text query to embed and search for.
        K: Number of matches requested; coerced to ``int`` because slider
            widgets may deliver a float.

    Returns:
        A text report: a header line, one ``score<TAB>sentence`` line per
        match, and a closing underline.
    """
    K = int(K)
    selected = doc_idx[0] if isinstance(doc_idx, (list, tuple)) else doc_idx
    idx = int(selected) - 1  # dropdown numbers are 1-based
    newdoc = build_doc_frame(df, idx=idx)
    embeddings = get_doc_embeddings(newdoc)

    pretty_results = []
    if embeddings.ndim == 2 and len(embeddings) > 0:
        # Inner product over L2-normalised vectors equals cosine similarity.
        faiss.normalize_L2(embeddings)
        # Take the dimension from the data instead of hard-coding 768.
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        # Embed the caller's query (it used to be overwritten by a
        # hard-coded string, so user input was ignored).
        target_emb = np.array(model.encode([query_str]), dtype="float32")
        target_emb = target_emb.reshape(1, -1)
        faiss.normalize_L2(target_emb)

        # Never request more neighbours than there are sentences.
        D, I = index.search(target_emb, min(K, len(embeddings)))
        print(list(zip(D[0], I[0])))

        for hit, score in zip(I[0], D[0]):
            # FAISS pads missing results with -1, which ``iloc`` would
            # silently map to the last sentence.
            if hit < 0:
                continue
            pretty_results.append(
                (round(float(score), 3), newdoc.iloc[hit].sentences)
            )

    pretty_results_str = "\n".join(
        f"{score}\t{sent}" for score, sent in pretty_results
    )
    top_k_str = f"Top {K} results for: {query_str}"
    underlines = "__" * 40

    return f"{top_k_str}\n{pretty_results_str}\n{underlines}"
|
52 |
+
|
53 |
+
|
54 |
+
# Load the dataset and the sentence encoder once at start-up; ``df`` and
# ``model`` are read as globals by faiss_search.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
model = SentenceTransformer("NbAiLab/nb-sbert-base")
df = dataset["train"].to_pandas()

# One (1-based number, truncated first summary entry) pair per row of ``df``.
dropdown_opts = [
    (row_label + 1, f"\t{row.summary[0][:60]}...")
    for row_label, row in df.iterrows()
]

# Inputs map positionally onto faiss_search(doc_idx, query_str, K).
iface = gr.Interface(
    title="Lovdata rettsavgjørelser - semantisk søk",
    description="Velg en rettsak og søk for å hente ut lignende setninger i saken",
    fn=faiss_search,
    inputs=[
        gr.Dropdown(choices=dropdown_opts, label="Select a court case"),
        gr.Textbox(placeholder="Your query here...", lines=2),
        gr.Slider(label="Number of matches", minimum=1, maximum=10, value=5),
    ],
    outputs="text",
)

iface.launch()
|