Anthony committed on
Commit
4df3cb6
·
1 Parent(s): e83d68b
Files changed (1) hide show
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer, util
2
+ import torch
3
+ import os
4
+ import gradio as gr
5
+ import json
6
+ import re
7
+ import numpy as np
8
+ import pickle
9
+ from datetime import datetime
10
+ from huggingface_hub import Repository
11
+ from datasets import load_dataset
12
+
13
############# Read in the data #############

# Hugging Face access token, read from the environment so it is not stored in
# the source. ``None`` is passed through unchanged if the variable is unset.
access_token_1 = os.environ.get("HF_token")

dataset = load_dataset("acd424/tribunal_data", use_auth_token=access_token_1)

# Look the split up once instead of once per column.
train_split = dataset["train"]

# Convert the stored embeddings to a tensor a single time at start-up.
# ``util.cos_sim`` otherwise rebuilds a tensor from the nested Python lists on
# every query, which is pure repeated work.
embeddings = torch.tensor(train_split["embed"])

# Columns read alongside the embeddings; they are indexed with the positions
# returned by the similarity search, so they are assumed to be row-aligned
# with ``embeddings``.
corpus = train_split["reason_text"]
files = train_split["file_name"]
all_cats = train_split["all_cats"]

print("The data has loaded")

# for saving a log: the dataset repository is cloned into ./data and the
# query log file lives inside that clone.
DATASET_REPO_URL = "https://huggingface.co/datasets/acd424/tribunal_data"
DATA_FILENAME = "queries_and_responces.txt"
DATA_FILE = os.path.join("data", DATA_FILENAME)
repo = Repository(
    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=access_token_1
)
37
+
38
+ ################### Functions ##########################
39
+
40
+
41
def semantic_search(
    query, corpus=corpus, corpus_embeddings=embeddings, k=5, min_score=0.5
):
    """Return a formatted summary of the top ``k`` matches for ``query``.

    The query is embedded with the module-level ``embedder`` and compared
    against ``corpus_embeddings`` using cosine similarity.

    Parameters
    ----------
    query: str
        The query the user wants to search with
    corpus: list
        Items indexed by the positions of the best matches; the selected
        item is appended to the gov.uk employment-tribunal-decisions URL
    corpus_embeddings: list or torch.Tensor
        One embedding vector per corpus item
    k: int
        The number of results to include (default is 5)
    min_score: float
        If the best cosine similarity is below this value, a "query not
        appropriate" message is returned instead of results (default is 0.5)

    Returns
    -------
    str
        One section per match (rank, URL, categories taken from the
        module-level ``all_cats`` and the cosine score), or an explanatory
        message when nothing scores at least ``min_score``
    """
    weak_query_message = "The query is either not detailed enough or is perhaps not an appropriate query"

    # Nothing to search: report "no usable match" instead of failing on an
    # empty score tensor further down.
    if len(corpus_embeddings) == 0:
        return weak_query_message

    top_k = min(k, len(corpus_embeddings))

    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus embedding.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    if cos_scores.max() < min_score:
        return weak_query_message

    # torch.topk returns the scores sorted in descending order together with
    # their positions, so no separate argsort over all scores is needed.
    top_results = torch.topk(cos_scores, k=top_k)

    parts = []
    ranked = zip(top_results.values, top_results.indices)
    for rank, (score, idx) in enumerate(ranked, start=1):
        position = int(idx)
        parts.extend(
            (
                f" Match: {rank} \n",
                f"https://www.gov.uk/employment-tribunal-decisions/{corpus[position]}",
                "\n ======== With Catergories ========= \n",
                all_cats[position],
                f"\n ============================================ {score} \n\n\n",
            )
        )

    return "".join(parts)
90
+
91
+
92
def produce_tribunal_out(query, corpus=files, tribunal_embeddings=embeddings):
    """Run the search for ``query``, log the query, and update the output box.

    Parameters
    ----------
    query: str
        The text typed by the user
    corpus: list
        Passed to ``semantic_search`` as its ``corpus`` argument
    tribunal_embeddings: list or torch.Tensor
        Passed to ``semantic_search`` as its ``corpus_embeddings`` argument

    Returns
    -------
    The result of ``gr.update`` carrying the formatted search results as the
    new value of the output component
    """
    # get context
    context_string = semantic_search(
        query=query, corpus=corpus, corpus_embeddings=tribunal_embeddings
    )
    # get time
    time_now = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    # write to file
    # The log holds one "time|query" record per line, so line breaks inside
    # the query are flattened to spaces to keep each record on a single line.
    logged_query = " ".join(str(query).splitlines())

    # Explicit encoding so non-ASCII queries are written the same way
    # regardless of the platform's default.
    with open(DATA_FILE, "a", encoding="utf-8") as f:
        f.write(f"{time_now}|{logged_query}\n")

    # Push only after the file has been closed (and therefore flushed).
    commit_url = repo.push_to_hub()
    print(commit_url)

    return gr.update(value=context_string)
108
+
109
+
110
############### Specify models

# Sentence-embedding model used to encode user queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

############### The App

with gr.Blocks() as demo:
    gr.Markdown("Employment tribunal demo")

    # text input from user
    inp = gr.Textbox(
        placeholder="Type your employment tribunal query here - describe your problem",
        label="Tribunal Query",
    )

    # button that triggers the search
    process_btn = gr.Button("Search records from UK employment tribunals")

    # output box that receives the formatted search results
    Suggested_text = gr.Textbox(
        value="Suggestions will appear here", label="", lines=20
    )

    Suggested_text.style(show_copy_button=True)

    process_btn.click(
        fn=produce_tribunal_out,
        inputs=[inp],
        outputs=Suggested_text,
    )

# Login credentials are read from the environment, like the HF token above,
# so they can be changed without editing the source.
# NOTE(review): the fallbacks keep the previous hard-coded values so existing
# deployments keep working; set both variables and drop the fallbacks.
demo.launch(
    auth=(
        os.environ.get("TRIBUNAL_APP_USERNAME", "admin"),
        os.environ.get("TRIBUNAL_APP_PASSWORD", "dataisking"),
    )
)
141
+
142
+
143
+ ##########################################