Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,32 +1,30 @@
# app.py — legal-document QA chatbot: Streamlit UI + FAISS retrieval +
# Legal-BERT extractive question answering.
#
# Pipeline: upload a document -> split into paragraphs -> embed each
# paragraph with a SentenceTransformer -> store the vectors in a FAISS
# index. On a user question: embed the question, retrieve the top-k
# closest paragraphs, and run a BERT QA head over (question, retrieved
# text) to extract an answer span.

import faiss
import numpy as np
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Embedding dimension of paraphrase-MiniLM-L6-v2.
EMBEDDING_DIM = 384
# Number of paragraphs retrieved per question.
TOP_K = 3


@st.cache_resource
def load_models():
    """Load the tokenizer, QA model and sentence embedder once per process.

    Streamlit re-runs the whole script on every interaction; without
    caching, these large models would be rebuilt on each rerun.
    """
    tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
    qa_model = AutoModelForQuestionAnswering.from_pretrained(
        "nlpaueb/legal-bert-base-uncased"
    )
    embedder = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
    return tokenizer, qa_model, embedder


tokenizer, qa_model, embedder = load_models()

# Retrieval state must survive Streamlit reruns, and the same upload must
# not be indexed twice (the original re-indexed the document on every
# rerun, duplicating entries in the FAISS index).
if "index" not in st.session_state:
    st.session_state.index = faiss.IndexFlatL2(EMBEDDING_DIM)
    st.session_state.document_chunks = []
    st.session_state.indexed_file = None


def _embed(text):
    """Encode *text* as the float32 (1, dim) matrix FAISS requires."""
    return np.asarray(embedder.encode(text), dtype="float32").reshape(1, -1)


st.title("AI Jogi Chatbot FAISS-szel - Hugging Face Spaces")

# --- Document upload & indexing ------------------------------------------
uploaded_file = st.file_uploader("Töltsön fel egy dokumentumot", type=["txt", "pdf"])

if uploaded_file and st.session_state.indexed_file != uploaded_file.name:
    # NOTE(review): .decode("utf-8") only works for plain-text uploads; a
    # real PDF needs a proper text extractor. errors="replace" at least
    # avoids a hard crash on binary input — confirm intended file types.
    context = uploaded_file.read().decode("utf-8", errors="replace")

    # Split on blank lines; skip empty paragraphs instead of embedding them.
    paragraphs = [p.strip() for p in context.split("\n\n") if p.strip()]
    for paragraph in paragraphs:
        st.session_state.document_chunks.append(paragraph)
        st.session_state.index.add(_embed(paragraph))

    st.session_state.indexed_file = uploaded_file.name
    st.write(f"A dokumentum sikeresen indexelve. {len(paragraphs)} bekezdés feldolgozva.")

# --- Question answering ---------------------------------------------------
question = st.text_input("Írja be a kérdését a dokumentumról:")

if question and st.session_state.document_chunks:
    # Embed the question and retrieve the closest paragraphs. Cap k at the
    # corpus size and drop the -1 padding ids FAISS returns when the index
    # holds fewer than k vectors.
    k = min(TOP_K, len(st.session_state.document_chunks))
    _, indices = st.session_state.index.search(_embed(question), k)
    relevant_paragraphs = " ".join(
        st.session_state.document_chunks[i] for i in indices[0] if i >= 0
    )

    # Extractive QA: the model scores every token as a span start/end.
    # Truncate to the model's 512-token window instead of crashing on
    # long retrieved contexts.
    inputs = tokenizer.encode_plus(
        question,
        relevant_paragraphs,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():  # inference only — no gradients needed
        outputs = qa_model(**inputs)

    # Use the named logit fields; the original `.values()` unpacking breaks
    # silently if the ModelOutput ever gains or reorders fields.
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
    )

    st.write("Válasz:", answer)