Tollef Jørgensen
ignore and some updates
77c842b
import html

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
# Mutable module-level state shared between the Gradio callbacks below.
# `on_selection_change` rebinds all three whenever a case is picked:
#   idx    - row of `df` for the currently selected case
#   index  - FAISS index over that case's sentence embeddings (None until a case is picked)
#   newdoc - per-sentence frame for that case (None until a case is picked)
idx = 0
index = None
newdoc = None
# Loaded once when the module is executed.
# NOTE(review): both loaders presumably fetch from the Hugging Face Hub on first
# use - confirm caching/offline behaviour for the deployment target.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
model = SentenceTransformer("NbAiLab/nb-sbert-base")
# The "train" split as a DataFrame; the code below reads its `url`, `sentences`,
# `embedding` and `summary` columns.
df = dataset["train"].to_pandas()
def build_doc_frame(df, idx):
    """Expand one document row into a frame with one row per sentence.

    Args:
        df: Frame holding at least the ``url``, ``sentences`` and
            ``embedding`` columns, where the latter two are equally long
            list-likes per row.
        idx: Positional index (``iloc``) of the document to expand.

    Returns:
        A frame with columns ``url``, ``sentences`` and ``embedding`` in
        which every sentence sits on its own row next to its embedding.
        All rows keep the index label of the source row.
    """
    wanted_columns = ["url", "sentences", "embedding"]
    paired_columns = ["sentences", "embedding"]

    # A single row comes back as a Series; turn it into a one-row frame.
    row = df.iloc[idx]
    single_row_frame = pd.DataFrame(row).transpose()

    # Drop everything but the needed columns, then unpack the sentence and
    # embedding lists in lockstep so they stay aligned row by row.
    trimmed = single_row_frame[wanted_columns]
    return trimmed.explode(paired_columns)
def get_doc_embeddings(doc):
    """Collect the ``embedding`` column of *doc* into one float32 array.

    Args:
        doc: Object exposing an ``embedding`` attribute with a ``tolist()``
            method (e.g. the frame produced by ``build_doc_frame``).

    Returns:
        A freshly allocated ``numpy.ndarray`` of dtype ``float32`` built
        from the listed embeddings.
    """
    vectors = doc.embedding.tolist()
    matrix = np.array(vectors, dtype=np.float32)
    return matrix
def faiss_search(query_str, K=5):
    """Find the sentences of the selected case that best match a query.

    Reads the module-level ``model``, ``index`` and ``newdoc``; it does not
    rebind any module-level state.

    Args:
        query_str: Free-text query to embed and search for.
        K: Maximum number of hits to return. Coerced to an ``int`` of at
            least 1 (a UI slider may presumably deliver a float).

    Returns:
        A printable string: a header line followed by one
        ``Score: <score>\\t\\t<sentence>`` line per hit, or a short notice
        when no case has been selected yet.
    """
    # Nothing to search until a case has been picked and indexed.
    if index is None or newdoc is None:
        return "Velg en rettsavgjørelse først."

    k = max(1, int(K))

    # FAISS expects a 2-D float32 array; normalise in place so the inner
    # product computed by the index equals cosine similarity.
    query_emb = np.array(model.encode([query_str]), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(query_emb)

    scores, hits = index.search(query_emb, k)
    print(list(zip(scores[0], hits[0])))

    # FAISS pads with label -1 when the index holds fewer than k vectors;
    # skip those so they do not resolve to the last sentence via iloc[-1].
    # The loop variables are deliberately not called `idx`, which is the
    # name of the module-level selected-row state.
    result_lines = [
        f"Score: {round(float(score), 3)}\t\t{newdoc.iloc[int(hit)].sentences}"
        for hit, score in zip(hits[0], scores[0])
        if hit >= 0
    ]
    pretty_results_str = "\n".join(result_lines)
    top_k_str = f"Top {k} results for: {query_str}"
    return f"{top_k_str}\n{pretty_results_str}"
# def DropdownSummary():
# next_opts = df.iloc[idx].summary.tolist()
# return gr.Dropdown.update(choices=next_opts, label="Velg fra oppsummeringene")
# Case URLs offered in the dropdown, in row order. Read straight from the
# column instead of iterating rows (iterrows builds a Series per row).
dropdown_opts = df["url"].tolist()
# NOTE(review): the original indentation of this block was lost; the nesting
# below (in particular which column `output` belongs to) is reconstructed and
# should be confirmed against the intended layout.
with gr.Blocks() as demo:
    gr.HTML("\n<h1>Lovdata rettsavgjørelser - semantisk søk</h1>\n")

    def on_selection_change(selected_case):
        """Index the chosen case for searching and render its summary.

        Rebinds the module-level ``idx``, ``index`` and ``newdoc`` so that
        ``faiss_search`` operates on the newly selected case.

        Args:
            selected_case: URL of the case picked in the dropdown; may be
                ``None`` or unknown when the dropdown is cleared.

        Returns:
            A ``(summary_html, url_html)`` pair of HTML strings; both are
            empty when ``selected_case`` matches no row, in which case the
            module-level state is left untouched.
        """
        global idx, index, newdoc

        # Resolve the row *position*: it is used with ``iloc`` below, so
        # an index label would only be correct for a default RangeIndex.
        matches = np.flatnonzero((df.url == selected_case).to_numpy())
        if matches.size == 0:
            return "", ""
        position = int(matches[0])

        print("Selection changed!")
        print(selected_case)

        doc_frame = build_doc_frame(df, position)
        embeddings = get_doc_embeddings(doc_frame)
        # Unit-length vectors make the inner-product index rank by cosine.
        faiss.normalize_L2(embeddings)
        # Take the dimension from the data rather than hard-coding 768.
        doc_index = faiss.IndexFlatIP(embeddings.shape[1])
        doc_index.add(embeddings)

        # Publish the new state only once the index is fully built.
        idx, newdoc, index = position, doc_frame, doc_index

        # Escape the text so characters such as "<" or "&" in a summary
        # or URL cannot break (or inject into) the rendered markup.
        summary = list(df.iloc[position].summary)
        items = "".join(f"<li>{html.escape(str(sent))}</li>" for sent in summary)
        summary_markup = f"<ul>{items}</ul>"
        safe_url = html.escape(str(selected_case), quote=True)
        url_markup = f"<a href='{safe_url}' target='_blank'>{safe_url}</a>"
        return summary_markup, url_markup

    with gr.Row():
        with gr.Column():
            case_dropdown = gr.Dropdown(label="Velg en rettsavgjørelse", choices=dropdown_opts)
            # NOTE(review): whether gr.HTML honours (or even accepts)
            # `placeholder` depends on the Gradio version - confirm; `value=`
            # may be what was intended.
            summary_html = gr.HTML(label="Predefinert oppsummering", placeholder="<p>Velg en sak først<p>")
            case_url = gr.HTML(label="URL til rettsavgjørelse", placeholder="https://lovdata.no/...")
        with gr.Column():
            query = gr.Textbox(
                label="Søk etter setninger",
                lines=1,
                placeholder="Kollisjon mellom to kjøretøy.",
            )
            k_slider = gr.Slider(minimum=1, maximum=10, label="Antall treff", value=5, step=1)
            search_btn = gr.Button("Søk")
            output = gr.Textbox(label="Resultater", lines=10)

    # Picking a case re-indexes it and refreshes the summary and link panes.
    case_dropdown.change(
        on_selection_change,
        inputs=[case_dropdown],
        outputs=[summary_html, case_url],
    )
    # The search button runs the query against the currently indexed case.
    search_btn.click(faiss_search, inputs=[query, k_slider], outputs=[output])
# Start serving the Blocks app; this runs whenever the module is executed.
demo.launch()