Commit 1df3d2a · Parent(s): 0f6450a
Upload local project

Files changed:
- .gitattributes +2 -0
- .gitignore +9 -0
- LICENSE +21 -0
- README.md +18 -9
- app/gradio_ui.py +44 -0
- app/ingest.py +26 -0
- app/managers/vector_manager.py +55 -0
- app/models/model.py +14 -0
- app/rag_chain.py +38 -0
- app/utils/handle_api.py +13 -0
- app/utils/pdf2vector.py +6 -0
- build.sh +1 -0
- data/fedlex-data-admin-ch-de.pdf +3 -0
- data/fedlex-data-admin-ch-en.pdf +3 -0
- examples/questions.txt +2 -0
- requirements.txt +13 -0
- run.py +4 -0
- vectorstore/index.faiss +3 -0
- vectorstore/index.pkl +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.faiss filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
+**/__pycache__/
+**/.vscode/
+**/.ipynb_checkpoints/
+**/.pytest_cache/
+# scripts/
+Archived/
+.env
+app/tests/
+huggingface_manage.md
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 nanfang_wuyu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md CHANGED
@@ -1,13 +1,22 @@
+<!-- ---
+title: "LLM_Powered_Legal_RAG_Demo"
+emoji: "📚"
+colorFrom: "indigo"
+colorTo: "pink"
+sdk: gradio
+python_version: 3.13
+accelerator: cpu # gpu when local, be online in future
+--- -->
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: LLM_Powered_Legal_RAG_Demo
+emoji: 📚
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
 sdk_version: 5.29.1
-app_file:
-pinned:
+app_file: run.py
+pinned: true
 license: mit
+accelerator: cpu
+python_version: 3.13
+---
-
-
-
app/gradio_ui.py ADDED
@@ -0,0 +1,44 @@
+import gradio as gr
+from app.rag_chain import get_qa_chain
+
+def load_example_questions(path="examples/questions.txt"):
+    with open(path, "r", encoding="utf-8") as f:
+        questions = [line.strip() for line in f if line.strip()]
+    return questions
+
+example_questions = load_example_questions()
+
+
+qa_chain = get_qa_chain()
+
+def ask_question(query):
+    result = qa_chain.invoke({"input": query})
+    # print(result)
+    answer = result["answer"]
+    sources = "\n\nSources:\n\n" + "\n".join([f"{doc.metadata['source']} {doc.metadata['article_number']}" for doc in result["context"]])
+    return answer + sources
+
+# iface = gr.Interface(fn=ask_question,
+#                      inputs=gr.Textbox(label="Ask something about the document..."),
+#                      outputs=gr.Textbox(label="Answer with sources"),
+#                      title="LLM-Powered Legal RAG Demo")
+
+def from_example_dropdown(selected_example):
+    return selected_example
+
+with gr.Blocks() as iface:
+    gr.Markdown("## LLM-Powered Legal RAG Demo")
+
+
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            textbox = gr.Textbox(label="Ask something about the Federal Act on Foreign Nationals and Integration ... ", lines=2)
+            dropdown = gr.Dropdown(label="📚 Choose an example question", choices=[''] + example_questions)
+            dropdown.change(fn=from_example_dropdown, inputs=dropdown, outputs=textbox)
+            submit_btn = gr.Button("Submit")
+
+        with gr.Column(scale=1):
+            output = gr.Textbox(label="Answer with sources", lines=12)
+
+    submit_btn.click(fn=ask_question, inputs=textbox, outputs=output)
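The `ask_question` handler above concatenates the model's answer with a source list built from each retrieved document's metadata. A minimal sketch of that formatting logic in isolation, with a stand-in result dict in place of a real chain invocation (the FakeDoc class and sample values are illustrative, not part of the repo):

# Stand-in for the dict returned by qa_chain.invoke({"input": ...}):
# an "answer" string plus a "context" list of retrieved documents.
class FakeDoc:
    def __init__(self, source, article_number):
        self.metadata = {"source": source, "article_number": article_number}

result = {
    "answer": "Yes, under certain conditions.",
    "context": [FakeDoc("fedlex-data-admin-ch-en.pdf", "Art. 21"),
                FakeDoc("fedlex-data-admin-ch-en.pdf", "Art. 40")],
}

sources = "\n\nSources:\n\n" + "\n".join(
    f"{doc.metadata['source']} {doc.metadata['article_number']}" for doc in result["context"]
)
print(result["answer"] + sources)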
app/ingest.py ADDED
@@ -0,0 +1,26 @@
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.schema import Document
+from app.managers import vector_manager as vm
+import os
+
+def ingest_fedlex_pdf(file_path):
+    loader = PyPDFLoader(file_path)
+    docs = loader.load()
+    full_text = "\n".join([doc.page_content for doc in docs])
+
+    chunks = full_text.split("Art.")
+    processed_docs = []
+    for i, chunk in enumerate(chunks[1:], start=1):  # skip preamble
+        content = f"Art. {chunk.strip()}"
+        metadata = {
+            "source": os.path.basename(file_path),
+            "article_number": f"Art. {i}"
+        }
+        processed_docs.append(Document(page_content=content, metadata=metadata))
+
+
+    vm.add_multi_documents(processed_docs)
+    print(f"✅ Ingested {len(processed_docs)} articles from {file_path}")
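Note that the article numbering here is positional: the text is split on the literal string "Art.", and the i-th chunk is labeled "Art. i". That only holds if every occurrence of "Art." is an article heading and the articles appear in order; in-text cross-references to "Art." would make the numbering drift. A toy run of the same heuristic (sample text is made up):

full_text = "Preamble text\nArt. 1 First rule\nArt. 2 Second rule"
chunks = full_text.split("Art.")

for i, chunk in enumerate(chunks[1:], start=1):  # chunks[0] is the preamble
    content = f"Art. {chunk.strip()}"
    print(i, repr(content))
# 1 'Art. 1 First rule'
# 2 'Art. 2 Second rule'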
app/managers/vector_manager.py ADDED
@@ -0,0 +1,55 @@
+import os
+import faiss
+from typing import List, Optional
+from langchain_community.vectorstores import FAISS
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain.schema import Document
+from app.models.model import Embedding_model
+
+BASE_PATH = "vectorstore"
+
+def create_new_vectorstore(embedding_model):
+    dim = len(embedding_model.embed_query("hello world"))
+    index = faiss.IndexFlatL2(dim)
+    return FAISS(
+        embedding_function=embedding_model,
+        index=index,
+        docstore=InMemoryDocstore(),
+        index_to_docstore_id={}
+    )
+
+
+def load_vectorstore() -> FAISS:
+    path = BASE_PATH
+
+    if os.path.exists(os.path.join(path, 'index.faiss')):
+        print("Reload existing faiss")
+        return FAISS.load_local(path, Embedding_model, allow_dangerous_deserialization=True)
+    else:
+        print("Create new faiss")
+        vs = create_new_vectorstore(Embedding_model)
+        save_vectorstore(vs)
+        return vs
+
+
+def save_vectorstore(vectorstore: FAISS):
+    path = BASE_PATH
+    vectorstore.save_local(path)
+
+
+def add_document(content: str, metadata: dict):
+    vs = load_vectorstore()
+    doc = Document(page_content=content, metadata=metadata)
+    vs.add_documents([doc])
+    save_vectorstore(vs)
+
+def add_multi_documents(processed_docs: list):
+    vs = load_vectorstore()
+    vs.add_documents(processed_docs)
+    save_vectorstore(vs)
+
+
+
+def get_relevant_documents(query: str, top_k: int = 5) -> List[Document]:
+    vs = load_vectorstore()  # load_vectorstore() takes no arguments
+    return vs.similarity_search(query, k=top_k)
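Taken together, the manager reloads (or lazily creates and persists) a FAISS index under vectorstore/ on every call. A hedged usage sketch, assuming the app package is importable from the project root and the all-MiniLM-L6-v2 embedding model can be downloaded; the sample text and query are illustrative only:

from app.managers import vector_manager as vm

# Persist one document, then retrieve by similarity.
vm.add_document("Art. 1 This Act governs entry and exit.",
                {"source": "example.pdf", "article_number": "Art. 1"})
hits = vm.get_relevant_documents("Who does the Act apply to?", top_k=3)
for doc in hits:
    print(doc.metadata["article_number"], doc.page_content[:60])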
app/models/model.py ADDED
@@ -0,0 +1,14 @@
+from langchain_huggingface import HuggingFaceEmbeddings
+from app.utils.handle_api import get_api
+# from langchain_community.llms import OpenAI
+from langchain_openai import ChatOpenAI
+
+
+# model = TinyLlamaModel()
+LLM = ChatOpenAI(
+    model="gpt-4o-mini",
+    api_key=get_api(),
+    temperature=0,
+    max_tokens=None
+)
+Embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
app/rag_chain.py ADDED
@@ -0,0 +1,38 @@
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate
+from app.models.model import LLM
+from app.managers import vector_manager as vm
+from langchain import hub
+
+def get_qa_chain():
+    db = vm.load_vectorstore()
+    retriever = db.as_retriever(search_kwargs={"k": 5})
+    # qa_chain = RetrievalQA.from_chain_type(
+    #     llm=LLM,
+    #     retriever=retriever,
+    #     return_source_documents=True
+    # )
+
+    # system_prompt = (
+    #     "You are a helpful ... "
+    #     "If you don't know the answer, say you don't know. "
+    #     "... maximum and keep the answer concise. "
+    #     "Context: {context}"
+    # )
+
+    # prompt = ChatPromptTemplate.from_messages(
+    #     [
+    #         ("system", system_prompt),
+    #         ("human", "{input}"),
+    #     ]
+    # )
+    prompt = hub.pull("langchain-ai/retrieval-qa-chat")
+    question_answer_chain = create_stuff_documents_chain(LLM, prompt)
+    chain = create_retrieval_chain(retriever, question_answer_chain)
+
+    # chain.invoke({"input": query})
+
+    return chain
+
+
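The object returned by get_qa_chain() is a standard LangChain retrieval chain: invoke takes {"input": ...} and returns a dict whose "answer" key holds the generated text and whose "context" key holds the retrieved Documents; this is the contract app/gradio_ui.py relies on. A minimal usage sketch, assuming OPENAI_API_KEY is set and the vector store is populated:

from app.rag_chain import get_qa_chain

chain = get_qa_chain()
result = chain.invoke({"input": "Can a student work full-time during semester breaks?"})

print(result["answer"])
for doc in result["context"]:  # the retrieved source chunks
    print(doc.metadata["source"], doc.metadata["article_number"])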
app/utils/handle_api.py ADDED
@@ -0,0 +1,13 @@
+
+import os
+import getpass
+from dotenv import load_dotenv
+
+def get_api():
+    load_dotenv()
+    api_key = os.getenv("OPENAI_API_KEY")
+    if api_key is None:
+        api_key = getpass.getpass("Enter API key for OpenAI: ")
+    if not api_key:
+        raise ValueError("API key is required. Please set it in the environment variable OPENAI_API_KEY or provide it directly.")
+    return api_key
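get_api() reads the key from a .env file (via python-dotenv) or the process environment, falling back to an interactive prompt. A minimal sketch of the expected setup; the file contents shown are a placeholder, not a real key:

# .env (project root):
#   OPENAI_API_KEY=sk-...
from app.utils.handle_api import get_api

key = get_api()         # loads .env, else prompts via getpass
print(key[:6] + "...")  # avoid echoing the full secret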
app/utils/pdf2vector.py ADDED
@@ -0,0 +1,6 @@
+from app.managers import vector_manager as vm
+from app.ingest import ingest_fedlex_pdf
+import os
+
+if len(vm.load_vectorstore().docstore._dict.values()) == 0:
+    ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf")
build.sh ADDED
@@ -0,0 +1 @@
+pip install -r requirements.txt
data/fedlex-data-admin-ch-de.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36886540730b78fbe3495246937e7d08159d889cbb8f37b3ab90c1738d34109a
+size 1116300
data/fedlex-data-admin-ch-en.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e96201d59e3f932aeaf6a32309d98ca2ac4dc0d4bab3d01f1c9df6482edc78ed
+size 1057197
examples/questions.txt ADDED
@@ -0,0 +1,2 @@
+Are non-EU UZH international master's students eligible to apply for a work permit after graduation?
+I'm a non-EU student studying for a master's degree at the University of Zurich with a student B residence permit. Can I legally work full-time during the summer breaks?
requirements.txt ADDED
@@ -0,0 +1,13 @@
+langchain
+openai
+faiss-cpu
+sentence-transformers
+PyPDF2
+gradio
+langchain-community
+langchain_huggingface
+langchain_openai
+pydantic
+ipykernel
+pypdf
+git-lfs
run.py ADDED
@@ -0,0 +1,4 @@
+from app import gradio_ui
+
+if __name__ == "__main__":
+    gradio_ui.iface.launch()
vectorstore/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e409000abe563b62994c4d34b7e51351337fae63943a7df672ad3f85316d5fce
+size 904749
vectorstore/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1441493373b74a67e26e39e2e3f0a953cc84b126be90c7e54805abe9b850d8bc
+size 386380