# Repository page header (not part of the program):
# Tollef Jørgensen
# Update with reqs
# 8eef9b2
# raw
# history blame
# 2.28 kB
import faiss
import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
def build_doc_frame(df, idx=0):
    """Build a one-row-per-sentence frame for a single document.

    Args:
        df: DataFrame holding one document per row, with at least the
            columns ``url``, ``sentences`` and ``embedding``. ``sentences``
            and ``embedding`` must be list-likes of equal length per row.
        idx: Positional (0-based) row of the document to unpack.

    Returns:
        A DataFrame with columns ``url``, ``sentences`` and ``embedding``
        where every sentence (and its embedding) of the chosen document
        occupies its own row; ``url`` is repeated on each row.
    """
    # Select the requested row (previously row 0 was always used, which
    # made the ``idx`` argument a no-op). The double brackets keep the
    # selection as a one-row DataFrame.
    doc_df = df.iloc[[idx]][["url", "sentences", "embedding"]]
    # Unpack the parallel sentence/embedding lists into separate rows.
    return doc_df.explode(["sentences", "embedding"])
def get_doc_embeddings(doc):
    """Collect the ``embedding`` column of ``doc`` into one float32 array.

    Args:
        doc: Object exposing an ``embedding`` attribute with a ``tolist()``
            method (e.g. a DataFrame with an ``embedding`` column).

    Returns:
        A ``numpy.ndarray`` of dtype ``float32`` built from the listed
        embedding values.
    """
    vectors = doc.embedding.tolist()
    return np.array(vectors, dtype=np.float32)
def faiss_search(doc_idx, query_str, K=5):
    """Find the sentences of one court case most similar to a query.

    The sentence embeddings of the selected document are L2-normalised and
    put in an inner-product FAISS index, so the reported scores are cosine
    similarities between the query and each sentence.

    Args:
        doc_idx: The selected dropdown option, a ``(idx, text)`` pair whose
            first element is the 1-based document number.
        query_str: Free-text query to embed and search for.
        K: Number of matches to return. Coerced to ``int`` because it may
            arrive as a float from a slider widget.

    Returns:
        A printable string: a header line, one ``score<TAB>sentence`` line
        per match (best first), and a closing underline.
    """
    # doc_idx is a choice option of (idx, text); options are numbered from
    # 1, so shift back to a 0-based row position.
    idx = doc_idx[0] - 1
    newdoc = build_doc_frame(df, idx=idx)

    embeddings = get_doc_embeddings(newdoc)
    faiss.normalize_L2(embeddings)
    # Size the index from the data rather than hard-coding the encoder width.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # Encode the user's query (it was previously overwritten by a fixed
    # debug string) as a single float32 row vector, as FAISS expects.
    target_emb = np.array(model.encode([query_str]), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(target_emb)

    k = int(K)
    # Never ask for more neighbours than there are sentences: FAISS pads
    # missing hits with index -1, which ``iloc`` would silently map to the
    # last sentence.
    n_hits = min(k, index.ntotal)
    D, I = index.search(target_emb, n_hits)
    print(list(zip(D[0], I[0])))

    # prettyprint the results:
    pretty_results = [
        (round(float(score), 3), newdoc.iloc[hit].sentences)
        for hit, score in zip(I[0], D[0])
        if hit >= 0
    ]
    pretty_results_str = "\n".join(f"{score}\t{sent}" for score, sent in pretty_results)
    top_k_str = f"Top {k} results for: {query_str}"
    underlines = "__" * 40
    return f"{top_k_str}\n{pretty_results_str}\n{underlines}"
# --- Application setup -------------------------------------------------
# Load the dataset, the sentence encoder used for queries, and a pandas
# view of the "train" split. `model` and `df` are read by faiss_search.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
model = SentenceTransformer("NbAiLab/nb-sbert-base")
df = dataset["train"].to_pandas()


def _build_dropdown_options(frame):
    """Return one ``(row index + 1, preview)`` tuple per row of ``frame``.

    The preview is a tab, then ``summary[0]`` cut to 60 items, then "...".
    """
    options = []
    for row_label, row in frame.iterrows():
        preview = row.summary[0][:60]
        options.append((row_label + 1, f"\t{preview}..."))
    return options


dropdown_opts = _build_dropdown_options(df)

# Input widgets, in the positional order faiss_search expects:
# (doc_idx, query_str, K).
_case_picker = gr.Dropdown(label="Select a court case", choices=dropdown_opts)
_query_box = gr.Textbox(lines=2, placeholder="Your query here...")
_match_slider = gr.Slider(minimum=1, maximum=10, label="Number of matches", value=5)

iface = gr.Interface(
    fn=faiss_search,
    inputs=[_case_picker, _query_box, _match_slider],
    outputs="text",
    title="Lovdata rettsavgjørelser - semantisk søk",
    description="Velg en rettsak og søk for å hente ut lignende setninger i saken",
)
iface.launch()