Spaces:
Runtime error
Runtime error
File size: 3,955 Bytes
a05279b 8eef9b2 a05279b 4d794c6 77c842b a05279b 4d794c6 a05279b 77c842b 4d794c6 77c842b 4d794c6 77c842b a05279b 4d794c6 a05279b 4d794c6 a05279b 4d794c6 77c842b 4d794c6 77c842b 4d794c6 2de00c6 4d794c6 77c842b 4d794c6 a05279b 4d794c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import html

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
# Resources loaded once at import time and shared by every callback:
# the dataset (flattened to a DataFrame for row lookups) and the sentence
# encoder used to embed search queries.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
model = SentenceTransformer("NbAiLab/nb-sbert-base")
df = dataset["train"].to_pandas()

# Selection state shared between the Gradio callbacks:
#   idx    - row of the currently selected document in ``df``
#   index  - FAISS index over that document's sentence embeddings
#   newdoc - per-sentence frame of that document
idx, index, newdoc = 0, None, None
def build_doc_frame(df, idx):
    """Return one row per sentence for the document at position ``idx``.

    The selected row is turned into a single-row frame restricted to the
    ``url``, ``sentences`` and ``embedding`` columns. The two list-valued
    columns are then exploded in lockstep, so every output row pairs one
    sentence with its embedding while ``url`` is repeated.
    """
    wanted_columns = ["url", "sentences", "embedding"]
    # A single row comes back as a Series; transpose it into a 1-row frame.
    single_row = df.iloc[idx].to_frame().T
    return single_row[wanted_columns].explode(["sentences", "embedding"])
def get_doc_embeddings(doc):
    """Stack the vectors in ``doc``'s ``embedding`` column into a float32 array."""
    vectors = doc["embedding"].tolist()
    return np.array(vectors, dtype=np.float32)
def faiss_search(query_str, K=5):
    """Find the sentences of the selected document most similar to a query.

    The query is embedded with the module-level ``model``, L2-normalised and
    searched against the module-level FAISS ``index`` built for the selected
    document (``newdoc``).

    Args:
        query_str: Free-text query.
        K: Number of hits requested. Coerced to ``int`` because UI sliders
            may deliver floats.

    Returns:
        A text block: a header line followed by one ``Score: ...`` line per
        hit. If no document has been selected yet, a short hint is returned
        instead of raising.
    """
    # Only read access to module state is needed here, so no ``global``
    # declarations: local names can no longer clobber the module-level ``idx``.
    if index is None or newdoc is None:
        return "Velg en sak først."

    requested = int(K)
    # Asking for more neighbours than there are vectors makes FAISS pad the
    # result with id -1, which ``iloc`` would silently map to the last row.
    k = min(requested, index.ntotal)

    hits = []
    if k > 0:
        query_emb = np.ascontiguousarray(
            model.encode([query_str]), dtype="float32"
        ).reshape(1, -1)
        faiss.normalize_L2(query_emb)
        scores, row_ids = index.search(query_emb, k)
        print(list(zip(scores[0], row_ids[0])))
        hits = [
            (round(float(score), 3), newdoc["sentences"].iloc[int(row_id)])
            for row_id, score in zip(row_ids[0], scores[0])
            if row_id >= 0
        ]

    pretty_results_str = "\n".join(
        f"Score: {score}\t\t{sent}" for score, sent in hits
    )
    top_k_str = f"Top {requested} results for: {query_str}"
    return f"{top_k_str}\n{pretty_results_str}"
# def DropdownSummary():
# next_opts = df.iloc[idx].summary.tolist()
# return gr.Dropdown.update(choices=next_opts, label="Velg fra oppsummeringene")
# One dropdown entry per document URL. Reading the column directly avoids
# materialising a Series per row (as ``iterrows`` does) just to pick one field.
dropdown_opts = df["url"].tolist()
# Hints shown in the two HTML panes until a case is selected.
_SUMMARY_HINT = "<p>Velg en sak først</p>"
_URL_HINT = "https://lovdata.no/..."

# NOTE(review): the source's indentation was lost; the widget nesting below
# (which components sit in which column) is a reconstruction - confirm.
with gr.Blocks() as demo:
    gr.HTML(
        """
<h1>Lovdata rettsavgjørelser - semantisk søk</h1>
"""
    )

    def on_selection_change(selected_case):
        """Rebuild the per-document search state for the selected case.

        Looks up the row of ``df`` whose ``url`` equals ``selected_case``,
        builds its per-sentence frame and a FAISS inner-product index over the
        L2-normalised sentence embeddings, and stores them in the module-level
        ``idx`` / ``index`` / ``newdoc`` used by ``faiss_search``.

        Args:
            selected_case: URL chosen in the dropdown; may be ``None`` or
                unknown when the selection is cleared.

        Returns:
            A ``(summary_html, url_html)`` pair of HTML strings: the
            document's summary as a bullet list and a link to the case. When
            nothing matches, the initial hints are returned and the search
            state is reset.
        """
        global idx, index, newdoc

        print("Selection changed!")
        print(selected_case)

        # Positional lookup: the row is later read with ``iloc``, so an index
        # label would be wrong whenever ``df`` does not use a default index.
        positions = np.flatnonzero((df["url"] == selected_case).to_numpy())
        if positions.size == 0:
            index = None
            newdoc = None
            return _SUMMARY_HINT, _URL_HINT
        position = int(positions[0])

        doc_frame = build_doc_frame(df, position)
        embeddings = get_doc_embeddings(doc_frame)
        faiss.normalize_L2(embeddings)
        # Take the dimension from the data instead of hard-coding 768.
        doc_index = faiss.IndexFlatIP(embeddings.shape[1])
        doc_index.add(embeddings)

        # Publish the new state only once it is completely built.
        idx, index, newdoc = position, doc_index, doc_frame

        # Dataset text and the URL are escaped so that characters such as
        # '<', '&' or quotes cannot break (or inject) markup.
        summary_items = "".join(
            f"<li>{html.escape(str(sent))}</li>"
            for sent in df["summary"].iloc[position]
        )
        summary_markup = f"<ul>{summary_items}</ul>"
        safe_url = html.escape(str(selected_case))
        url_markup = f"<a href='{safe_url}' target='_blank'>{safe_url}</a>"
        return summary_markup, url_markup

    with gr.Row():
        with gr.Column():
            case_dropdown = gr.Dropdown(
                label="Velg en rettsavgjørelse",
                choices=dropdown_opts,
            )
            # ``placeholder`` is not a documented ``gr.HTML`` parameter; the
            # hint is passed as the initial ``value`` instead.
            summary_html = gr.HTML(
                value=_SUMMARY_HINT,
                label="Predefinert oppsummering",
            )
            case_url = gr.HTML(
                value=_URL_HINT,
                label="URL til rettsavgjørelse",
            )
        with gr.Column():
            query = gr.Textbox(
                label="Søk etter setninger",
                lines=1,
                placeholder="Kollisjon mellom to kjøretøy.",
            )
            k_slider = gr.Slider(
                minimum=1,
                maximum=10,
                label="Antall treff",
                value=5,
                step=1,
            )
            search_btn = gr.Button("Søk")
            output = gr.Textbox(label="Resultater", lines=10)

    case_dropdown.change(
        on_selection_change,
        inputs=[case_dropdown],
        outputs=[summary_html, case_url],
    )
    search_btn.click(faiss_search, inputs=[query, k_slider], outputs=[output])

demo.launch()
|