Spaces:
Runtime error
Runtime error
Anthony
commited on
Commit
·
4df3cb6
1
Parent(s):
e83d68b
app added
Browse files
app.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer, util
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import gradio as gr
|
5 |
+
import json
|
6 |
+
import re
|
7 |
+
import numpy as np
|
8 |
+
import pickle
|
9 |
+
from datetime import datetime
|
10 |
+
from huggingface_hub import Repository
|
11 |
+
from datasets import load_dataset
|
12 |
+
|
############# Read in the data #############

# Hugging Face access token pulled from the Space's secrets; None if the
# "HF_token" secret is not configured.
access_token_1 = os.environ.get("HF_token")

# NOTE(review): `use_auth_token` is deprecated in newer huggingface
# libraries in favour of `token=` — confirm the installed version before
# upgrading.
dataset = load_dataset("acd424/tribunal_data", use_auth_token=access_token_1)


# Pre-computed embedding vectors, one per tribunal-decision chunk.
embeddings = dataset["train"]["embed"]

# Raw decision text for each chunk.
corpus = dataset["train"]["reason_text"]

# Source file name for each chunk (used downstream to build gov.uk URLs).
files = dataset["train"]["file_name"]

# Category labels associated with each decision.
all_cats = dataset["train"]["all_cats"]

print("The data has loaded")

# for saving a log
DATASET_REPO_URL = "https://huggingface.co/datasets/acd424/tribunal_data"
DATA_FILENAME = "queries_and_responces.txt"
DATA_FILE = os.path.join("data", DATA_FILENAME)
# Clone the dataset repo into ./data so query logs can be appended to
# DATA_FILE and pushed back to the hub.
repo = Repository(
    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=access_token_1
)

################### Functions ##########################
|
def semantic_search(query, corpus=corpus, corpus_embeddings=embeddings, k=5):
    """Find the k corpus entries closest to the query by cosine similarity.

    Parameters
    ----------
    query: str
        The query the user wants to search with
    corpus: list
        Each item in the list is the text from a chunk
    corpus_embeddings: numpy.ndarray
        The word-embedding vector representation of the chunk
    k: int
        The number of results to return (default is 5)

    Returns
    -------
    str
        A formatted listing of the top k matches (gov.uk URL, categories
        and similarity score), or an advisory message when no match
        scores at least 0.5.
    """
    top_k = min(k, len(corpus_embeddings))

    # `embedder` is the module-level SentenceTransformer defined below.
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus vector, then
    # torch.topk to pick the highest-scoring matches.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    # The best score is simply the maximum — no need for the original's
    # full argsort of the score vector.
    highest_cos_score = torch.max(cos_scores)

    if highest_cos_score < 0.5:
        # No sufficiently-similar record: tell the user rather than
        # returning poor matches.
        final_string = "The query is either not detailed enough or is perhaps not an appropriate query"
    else:
        final_string = ""
        n = 1
        for score, idx in zip(top_results[0], top_results[1]):
            final_string = final_string + f" Match: {n} \n"
            final_string = (
                final_string
                + f"https://www.gov.uk/employment-tribunal-decisions/{corpus[idx]}"
            )
            final_string = final_string + "\n ======== With Catergories ========= \n"
            final_string = final_string + all_cats[idx]
            # `score` is already the n-th highest cosine score from topk;
            # the original re-derived it with an argsort over all scores.
            final_string = (
                final_string
                + f"\n ============================================ {score} \n\n\n"
            )
            n += 1

    return final_string
91 |
+
|
def produce_tribunal_out(query, corpus=files, tribunal_embeddings=embeddings):
    """Run a semantic search for *query* and log the query to the dataset repo.

    Returns a gradio update carrying the formatted search results.
    """
    # Timestamp first — independent of the search itself.
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    # Formatted suggestions from the shared search helper.
    suggestions = semantic_search(
        query=query, corpus=corpus, corpus_embeddings=tribunal_embeddings
    )

    # Append the query to the audit log and push it back to the hub.
    with open(DATA_FILE, "a") as log:
        log.write(f"{stamp}|{query}\n")
    print(repo.push_to_hub())

    return gr.update(value=suggestions)
109 |
+
|
############### Specify models

# Query encoder; assumed to be the same model that produced the
# pre-computed corpus embeddings, otherwise cosine scores are
# meaningless — TODO confirm against the dataset's embedding pipeline.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

############### The App

with gr.Blocks() as demo:
    gr.Markdown("Employment tribunal demo")

    # text input from user
    inp = gr.Textbox(
        placeholder="Type your employment tribunal query here - describe your problem",
        label="Tribunal Query",
    )

    # Button wiring the query through produce_tribunal_out.
    process_btn = gr.Button("Search records from UK employment tribunals")

    # Output area for the formatted search suggestions.
    Suggested_text = gr.Textbox(
        value="Suggestions will appear here", label="", lines=20
    )

    # NOTE(review): `.style(...)` was removed in Gradio 4.x — confirm the
    # pinned Gradio version supports it.
    Suggested_text.style(show_copy_button=True)

    process_btn.click(
        fn=produce_tribunal_out,
        inputs=[inp],
        outputs=Suggested_text,
    )

# NOTE(review): credentials are hard-coded in source; move them to
# environment variables / Space secrets.
demo.launch(auth=("admin", "dataisking"))


##########################################