File size: 3,955 Bytes
a05279b
 
 
 
 
8eef9b2
a05279b
4d794c6
77c842b
 
a05279b
4d794c6
 
 
 
 
 
 
a05279b
 
 
 
 
 
 
 
 
 
 
 
 
77c842b
4d794c6
77c842b
 
4d794c6
77c842b
 
a05279b
 
 
 
 
 
 
 
 
 
 
4d794c6
a05279b
 
 
4d794c6
a05279b
 
4d794c6
 
 
 
 
 
 
 
77c842b
 
 
 
4d794c6
 
77c842b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d794c6
2de00c6
4d794c6
77c842b
 
 
 
 
 
 
4d794c6
 
 
 
a05279b
4d794c6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import faiss
import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Mutable state shared by the search and selection callbacks below:
#   idx    - row of the currently selected document in `df`
#   index  - FAISS index over that document's sentence embeddings
#   newdoc - per-sentence frame for that document
# `index` and `newdoc` stay None until a case has been picked in the UI.
idx, index, newdoc = 0, None, None

# Dataset of documents with precomputed sentence embeddings, and the sentence
# encoder used to embed search queries at request time.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
model = SentenceTransformer("NbAiLab/nb-sbert-base")
df = dataset["train"].to_pandas()


def build_doc_frame(df, idx):
    """Return a per-sentence frame for the document at position ``idx``.

    The selected row is reduced to its ``url``, ``sentences`` and
    ``embedding`` columns, and the two list-like columns are exploded in
    lockstep so that each output row holds one sentence together with its
    embedding (the ``url`` value is repeated on every row).
    """
    wanted = ["url", "sentences", "embedding"]
    # A single row comes back as a Series; transpose it into a one-row frame.
    single_row = df.iloc[idx].to_frame().T
    return single_row[wanted].explode(["sentences", "embedding"])


def get_doc_embeddings(doc):
    """Collect the ``embedding`` column of ``doc`` into one float32 array."""
    vectors = doc.embedding.tolist()
    return np.asarray(vectors, dtype=np.float32)


def faiss_search(query_str, K=5):
    """Search the selected document for the sentences closest to a query.

    The query is embedded with the module-level ``model``, L2-normalised and
    matched against the module-level FAISS ``index``; hits are looked up in
    the module-level ``newdoc`` frame. Only reads of module state happen
    here, so no ``global`` declarations are needed.

    Args:
        query_str: Free-text query to embed.
        K: Maximum number of sentences to return. It is cast to ``int``
            because FAISS needs an integer ``k`` and a UI slider may
            deliver a float.

    Returns:
        A printable string: a ``Top K results for: ...`` header followed by
        one ``Score: <score>`` / sentence line per hit, or a short hint when
        no document has been selected yet.
    """
    # Nothing to search until a case has been chosen in the dropdown.
    if index is None or newdoc is None:
        return "Velg en rettsavgjørelse først."

    K = int(K)

    # One contiguous float32 row, as required by normalize_L2 and search.
    query_vec = np.ascontiguousarray(model.encode([query_str]), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(query_vec)

    D, I = index.search(query_vec, K)
    print(list(zip(D[0], I[0])))

    # FAISS pads the result with label -1 when the index holds fewer than K
    # vectors; iloc[-1] would silently return the last sentence for those
    # slots, so they are dropped here.
    hits = [
        (round(float(score), 3), newdoc.iloc[int(row)].sentences)
        for row, score in zip(I[0], D[0])
        if row >= 0
    ]
    pretty_results_str = "\n".join(f"Score: {score}\t\t{sent}" for score, sent in hits)
    top_k_str = f"Top {K} results for: {query_str}"

    return f"{top_k_str}\n{pretty_results_str}"


# def DropdownSummary():
#     next_opts = df.iloc[idx].summary.tolist()
#     return gr.Dropdown.update(choices=next_opts, label="Velg fra oppsummeringene")


# Dropdown choices: the URL of every document, in dataset order. Reading the
# column directly avoids building a Series per row with iterrows().
dropdown_opts = df["url"].tolist()

with gr.Blocks() as demo:
    gr.HTML(
        """
        <h1>Lovdata rettsavgjørelser - semantisk søk</h1>
        """
    )

    # Shown in the two HTML panes while no case is selected.
    NO_CASE_HTML = "<p>Velg en sak først</p>"
    NO_URL_HTML = "https://lovdata.no/..."

    def on_selection_change(selected_case):
        """Load the chosen decision and rebuild the sentence index for it.

        Updates the module-level ``idx``, ``newdoc`` and ``index`` that the
        search callback reads.

        Args:
            selected_case: URL picked in the dropdown, or ``None`` when the
                dropdown has been cleared.

        Returns:
            A ``(summary_html, url_html)`` pair for the two HTML panes: the
            document summary as a ``<ul>`` list and a link to the decision.
            When nothing matches the selection, the placeholder texts are
            returned and the search state is reset.
        """
        global idx
        global index
        global newdoc

        import html  # local import keeps this block self-contained

        print("Selection changed!")
        print(selected_case)

        # Positional lookup, so the iloc calls below stay correct even if
        # df does not carry a default 0..n-1 index.
        matches = np.flatnonzero((df.url == selected_case).to_numpy())
        if matches.size == 0:
            # Dropdown was cleared or holds a value that is not in df:
            # drop the stale index instead of searching the previous case.
            index = None
            newdoc = None
            return NO_CASE_HTML, NO_URL_HTML
        position = int(matches[0])

        # Build everything in locals first so a failure cannot leave the
        # globals pointing at two different documents.
        doc_frame = build_doc_frame(df, position)
        embeddings = get_doc_embeddings(doc_frame)
        faiss.normalize_L2(embeddings)
        # Size the index from the data instead of assuming 768-dim vectors.
        doc_index = faiss.IndexFlatIP(embeddings.shape[1])
        doc_index.add(embeddings)
        idx, newdoc, index = position, doc_frame, doc_index

        # Escape dataset text before embedding it in markup so characters
        # such as "<" or "&" in a sentence cannot break the rendered HTML.
        items = "".join(f"<li>{html.escape(str(sent))}</li>" for sent in df.iloc[position]["summary"])
        summary_markup = f"<ul>{items}</ul>"
        safe_url = html.escape(str(selected_case), quote=True)
        url_markup = f"<a href='{safe_url}' target='_blank'>{safe_url}</a>"
        return summary_markup, url_markup

    with gr.Row():
        with gr.Column():
            case_dropdown = gr.Dropdown(label="Velg en rettsavgjørelse", choices=dropdown_opts)
            # gr.HTML documents no `placeholder` parameter; the initial text
            # is passed as `value` so it is actually displayed.
            summary_html = gr.HTML(label="Predefinert oppsummering", value=NO_CASE_HTML)
            case_url = gr.HTML(label="URL til rettsavgjørelse", value=NO_URL_HTML)
        with gr.Column():
            query = gr.Textbox(
                label="Søk etter setninger",
                lines=1,
                placeholder="Kollisjon mellom to kjøretøy.",
            )
            k_slider = gr.Slider(minimum=1, maximum=10, label="Antall treff", value=5, step=1)
            search_btn = gr.Button("Søk")

    output = gr.Textbox(label="Resultater", lines=10)

    # Selecting a case refreshes the summary/link panes and the search index.
    case_dropdown.change(
        on_selection_change,
        inputs=[case_dropdown],
        outputs=[summary_html, case_url],
    )

    search_btn.click(faiss_search, inputs=[query, k_slider], outputs=[output])

# Start the Gradio app held in `demo`. There is no `__main__` guard, so this
# runs whenever the module is executed or imported.
demo.launch()