Spaces:
Runtime error
Runtime error
Anthony
commited on
Commit
·
4df3cb6
1
Parent(s):
e83d68b
app added
Browse files
app.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sentence_transformers import SentenceTransformer, util
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
import gradio as gr
|
5 |
+
import json
|
6 |
+
import re
|
7 |
+
import numpy as np
|
8 |
+
import pickle
|
9 |
+
from datetime import datetime
|
10 |
+
from huggingface_hub import Repository
|
11 |
+
from datasets import load_dataset
|
12 |
+
|
############# Read in the data #############

# Hugging Face access token pulled from the Space's secrets; None if the
# "HF_token" secret is not configured.
access_token_1 = os.environ.get("HF_token")

# NOTE(review): `use_auth_token` is deprecated in newer huggingface
# libraries in favour of `token=` — confirm the installed version before
# upgrading.
dataset = load_dataset("acd424/tribunal_data", use_auth_token=access_token_1)


# Pre-computed embedding vectors, one per tribunal-decision chunk.
embeddings = dataset["train"]["embed"]

# Raw decision text for each chunk.
corpus = dataset["train"]["reason_text"]

# Source file name for each chunk (used downstream to build gov.uk URLs).
files = dataset["train"]["file_name"]

# Category labels associated with each decision.
all_cats = dataset["train"]["all_cats"]

print("The data has loaded")

# for saving a log
DATASET_REPO_URL = "https://huggingface.co/datasets/acd424/tribunal_data"
DATA_FILENAME = "queries_and_responces.txt"
DATA_FILE = os.path.join("data", DATA_FILENAME)
# Clone the dataset repo into ./data so query logs can be appended to
# DATA_FILE and pushed back to the hub.
repo = Repository(
    local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=access_token_1
)

################### Functions ##########################
|
def semantic_search(query, corpus=corpus, corpus_embeddings=embeddings, k=5):
    """Find the k corpus entries closest to the query by cosine similarity.

    Parameters
    ----------
    query: str
        The query the user wants to search with
    corpus: list
        Each item in the list is the text from a chunk
    corpus_embeddings: numpy.ndarray
        The word-embedding vector representation of the chunk
    k: int
        The number of results to return (default is 5)

    Returns
    -------
    str
        A formatted listing of the top k matches (gov.uk URL, categories
        and similarity score), or an advisory message when no match
        scores at least 0.5.
    """
    top_k = min(k, len(corpus_embeddings))

    # `embedder` is the module-level SentenceTransformer defined below.
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus vector, then
    # torch.topk to pick the highest-scoring matches.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    # The best score is simply the maximum — no need for the original's
    # full argsort of the score vector.
    highest_cos_score = torch.max(cos_scores)

    if highest_cos_score < 0.5:
        # No sufficiently-similar record: tell the user rather than
        # returning poor matches.
        final_string = "The query is either not detailed enough or is perhaps not an appropriate query"
    else:
        final_string = ""
        n = 1
        for score, idx in zip(top_results[0], top_results[1]):
            final_string = final_string + f" Match: {n} \n"
            final_string = (
                final_string
                + f"https://www.gov.uk/employment-tribunal-decisions/{corpus[idx]}"
            )
            final_string = final_string + "\n ======== With Catergories ========= \n"
            final_string = final_string + all_cats[idx]
            # `score` is already the n-th highest cosine score from topk;
            # the original re-derived it with an argsort over all scores.
            final_string = (
                final_string
                + f"\n ============================================ {score} \n\n\n"
            )
            n += 1

    return final_string
91 |
+
|
def produce_tribunal_out(query, corpus=files, tribunal_embeddings=embeddings):
    """Run a semantic search for *query* and log the query to the dataset repo.

    Returns a gradio update carrying the formatted search results.
    """
    # Timestamp first — independent of the search itself.
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    # Formatted suggestions from the shared search helper.
    suggestions = semantic_search(
        query=query, corpus=corpus, corpus_embeddings=tribunal_embeddings
    )

    # Append the query to the audit log and push it back to the hub.
    with open(DATA_FILE, "a") as log:
        log.write(f"{stamp}|{query}\n")
    print(repo.push_to_hub())

    return gr.update(value=suggestions)
109 |
+
|
############### Specify models

# Query encoder; assumed to be the same model that produced the
# pre-computed corpus embeddings, otherwise cosine scores are
# meaningless — TODO confirm against the dataset's embedding pipeline.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

############### The App

with gr.Blocks() as demo:
    gr.Markdown("Employment tribunal demo")

    # text input from user
    inp = gr.Textbox(
        placeholder="Type your employment tribunal query here - describe your problem",
        label="Tribunal Query",
    )

    # Button wiring the query through produce_tribunal_out.
    process_btn = gr.Button("Search records from UK employment tribunals")

    # Output area for the formatted search suggestions.
    Suggested_text = gr.Textbox(
        value="Suggestions will appear here", label="", lines=20
    )

    # NOTE(review): `.style(...)` was removed in Gradio 4.x — confirm the
    # pinned Gradio version supports it.
    Suggested_text.style(show_copy_button=True)

    process_btn.click(
        fn=produce_tribunal_out,
        inputs=[inp],
        outputs=Suggested_text,
    )

# NOTE(review): credentials are hard-coded in source; move them to
# environment variables / Space secrets.
demo.launch(auth=("admin", "dataisking"))


##########################################