nanfangwuyu21 committed

Commit 1df3d2a · Parent: 0f6450a

Upload local project
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.faiss filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
+**/__pycache__/
+**/.vscode/
+**/.ipynb_checkpoints/
+**/.pytest_cache/
+# scripts/
+Archived/
+.env
+app/tests/
+huggingface_manage.md
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 nanfang_wuyu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md CHANGED
@@ -1,13 +1,22 @@
+<!-- ---
+title: "LLM_Powered_Legal_RAG_Demo"
+emoji: "📚"
+colorFrom: "indigo"
+colorTo: "pink"
+sdk: gradio
+python_version: 3.13
+accelerator: cpu # gpu when running locally; the online Space stays on cpu for now
+--- -->
 ---
-title: LLM Powered Legal RAG
-emoji: 👁
-colorFrom: purple
-colorTo: purple
+title: LLM_Powered_Legal_RAG_Demo
+emoji: 📚
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
 sdk_version: 5.29.1
-app_file: app.py
-pinned: false
+app_file: run.py
+pinned: true
 license: mit
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+accelerator: cpu
+python_version: 3.13
+---
app/gradio_ui.py ADDED
@@ -0,0 +1,38 @@
+import gradio as gr
+from app.rag_chain import get_qa_chain
+
+def load_example_questions(path="examples/questions.txt"):
+    with open(path, "r", encoding="utf-8") as f:
+        questions = [line.strip() for line in f if line.strip()]
+    return questions
+
+example_questions = load_example_questions()
+
+qa_chain = get_qa_chain()
+
+def ask_question(query):
+    result = qa_chain.invoke({"input": query})
+    answer = result["answer"]
+    sources = "\n\nSources:\n\n" + "\n".join(
+        f"{doc.metadata['source']} {doc.metadata['article_number']}"
+        for doc in result["context"]
+    )
+    return answer + sources
+
+def from_example_dropdown(selected_example):
+    return selected_example
+
+with gr.Blocks() as iface:
+    gr.Markdown("## LLM-Powered Legal RAG Demo")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            textbox = gr.Textbox(label="Ask something about the Federal Act on Foreign Nationals and Integration ...", lines=2)
+            dropdown = gr.Dropdown(label="📚 Choose an example question", choices=[""] + example_questions)
+            dropdown.change(fn=from_example_dropdown, inputs=dropdown, outputs=textbox)
+            submit_btn = gr.Button("Submit")
+
+        with gr.Column(scale=1):
+            output = gr.Textbox(label="Answer with sources", lines=12)
+
+    submit_btn.click(fn=ask_question, inputs=textbox, outputs=output)
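Serving this Blocks app takes nothing more than importing the module and calling launch(); run.py later in this commit does exactly that. A minimal local sketch, not part of the commit (assumes the requirements are installed, vectorstore/ is populated, and OPENAI_API_KEY is available via .env or the prompt):

    # Sketch: launch the Gradio Blocks UI locally.
    from app import gradio_ui

    gradio_ui.iface.launch()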
app/ingest.py ADDED
@@ -0,0 +1,26 @@
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.schema import Document
+from app.managers import vector_manager as vm
+import os
+
+def ingest_fedlex_pdf(file_path):
+    loader = PyPDFLoader(file_path)
+    docs = loader.load()
+    full_text = "\n".join([doc.page_content for doc in docs])
+
+    # Naive article segmentation: split on the literal token "Art." and
+    # number the chunks sequentially. This assumes articles appear in order
+    # and that "Art." never occurs mid-sentence; lettered articles (e.g.
+    # Art. 30a) would be mislabelled by the sequential index.
+    chunks = full_text.split("Art.")
+    processed_docs = []
+    for i, chunk in enumerate(chunks[1:], start=1):  # skip preamble
+        content = f"Art. {chunk.strip()}"
+        metadata = {
+            "source": os.path.basename(file_path),
+            "article_number": f"Art. {i}"
+        }
+        processed_docs.append(Document(page_content=content, metadata=metadata))
+
+    vm.add_multi_documents(processed_docs)
+    print(f"✅ Ingested {len(processed_docs)} articles from {file_path}")
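To make the splitting heuristic concrete, here is a standalone sketch (not part of the commit; the sample string is invented) showing what split("Art.") produces and why the sequential index can drift from the real article numbers:

    # Sketch: naive "Art." segmentation on an invented sample.
    sample = "Preamble text Art. 1 Subject matter ... Art. 2 Scope ... Art. 2a Exceptions ..."
    chunks = sample.split("Art.")
    articles = [f"Art. {c.strip()}" for c in chunks[1:]]  # skip preamble
    # articles[2] holds "Art. 2a Exceptions ..." but the sequential index
    # in ingest_fedlex_pdf would label it "Art. 3".
    print(articles)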
app/managers/vector_manager.py ADDED
@@ -0,0 +1,51 @@
+import os
+import faiss
+from typing import List
+from langchain_community.vectorstores import FAISS
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain.schema import Document
+from app.models.model import Embedding_model
+
+BASE_PATH = "vectorstore"
+
+def create_new_vectorstore(embedding_model):
+    # Probe the embedding dimension with a dummy query, then build an
+    # empty FAISS index backed by an in-memory docstore.
+    dim = len(embedding_model.embed_query("hello world"))
+    index = faiss.IndexFlatL2(dim)
+    return FAISS(
+        embedding_function=embedding_model,
+        index=index,
+        docstore=InMemoryDocstore(),
+        index_to_docstore_id={}
+    )
+
+def load_vectorstore() -> FAISS:
+    path = BASE_PATH
+    if os.path.exists(os.path.join(path, "index.faiss")):
+        print("Reload existing faiss")
+        return FAISS.load_local(path, Embedding_model, allow_dangerous_deserialization=True)
+    else:
+        print("Create new faiss")
+        vs = create_new_vectorstore(Embedding_model)
+        save_vectorstore(vs)
+        return vs
+
+def save_vectorstore(vectorstore: FAISS):
+    vectorstore.save_local(BASE_PATH)
+
+def add_document(content: str, metadata: dict):
+    vs = load_vectorstore()
+    doc = Document(page_content=content, metadata=metadata)
+    vs.add_documents([doc])
+    save_vectorstore(vs)
+
+def add_multi_documents(processed_docs: list):
+    vs = load_vectorstore()
+    vs.add_documents(processed_docs)
+    save_vectorstore(vs)
+
+def get_relevant_documents(query: str, top_k: int = 5) -> List[Document]:
+    # load_vectorstore takes no arguments; there is a single shared store.
+    vs = load_vectorstore()
+    return vs.similarity_search(query, k=top_k)
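A hedged usage sketch of this manager, not part of the commit (the article text and question are invented):

    # Sketch: add one article to the store and query it back.
    from app.managers import vector_manager as vm

    vm.add_document(
        "Art. 1 This Act governs the entry, residence and exit of foreign nationals.",
        {"source": "example.pdf", "article_number": "Art. 1"},
    )
    for doc in vm.get_relevant_documents("Who does the Act apply to?", top_k=3):
        print(doc.metadata["article_number"], doc.page_content[:60])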
app/models/model.py ADDED
@@ -0,0 +1,13 @@
+from langchain_huggingface import HuggingFaceEmbeddings
+from app.utils.handle_api import get_api
+from langchain_openai import ChatOpenAI
+
+# Chat model used for answer generation.
+LLM = ChatOpenAI(
+    model="gpt-4o-mini",
+    api_key=get_api(),
+    temperature=0,
+    max_tokens=None
+)
+# Sentence-transformer used to embed articles and queries.
+Embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
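For reference, all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings, which is the dimension create_new_vectorstore probes at runtime. A quick sanity-check sketch, not part of the commit (assumes the model downloads successfully):

    # Sketch: confirm the embedding dimension used to size the FAISS index.
    from app.models.model import Embedding_model

    print(len(Embedding_model.embed_query("hello world")))  # expected: 384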
app/rag_chain.py ADDED
@@ -0,0 +1,14 @@
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from app.models.model import LLM
+from app.managers import vector_manager as vm
+from langchain import hub
+
+def get_qa_chain():
+    db = vm.load_vectorstore()
+    retriever = db.as_retriever(search_kwargs={"k": 5})
+    # Standard RAG prompt from the LangChain hub; a custom
+    # ChatPromptTemplate could be swapped in here instead.
+    prompt = hub.pull("langchain-ai/retrieval-qa-chat")
+    question_answer_chain = create_stuff_documents_chain(LLM, prompt)
+    return create_retrieval_chain(retriever, question_answer_chain)
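The chain returned by create_retrieval_chain yields a dict with "answer" and "context" keys, which is exactly what ask_question in app/gradio_ui.py consumes. A hedged invocation sketch, not part of the commit (assumes a populated vectorstore and a valid OpenAI key; the question is invented):

    # Sketch: invoke the retrieval chain outside the UI.
    from app.rag_chain import get_qa_chain

    chain = get_qa_chain()
    result = chain.invoke({"input": "Can students work part-time in Switzerland?"})
    print(result["answer"])
    for doc in result["context"]:
        print(doc.metadata["source"], doc.metadata["article_number"])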
app/utils/handle_api.py ADDED
@@ -0,0 +1,14 @@
+import os
+import getpass
+from dotenv import load_dotenv
+
+def get_api():
+    # Prefer OPENAI_API_KEY from the environment / .env; fall back to an
+    # interactive prompt.
+    load_dotenv()
+    api_key = os.getenv("OPENAI_API_KEY")
+    if api_key is None:
+        api_key = getpass.getpass("Enter API key for OpenAI: ")
+    if not api_key:
+        raise ValueError("API key is required. Set the OPENAI_API_KEY environment variable or enter it at the prompt.")
+    return api_key
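Usage note: load_dotenv reads a .env file from the working directory (the .gitignore above keeps it out of the repo), so a single line of the form OPENAI_API_KEY=... is enough to skip the interactive prompt.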
app/utils/pdf2vector.py ADDED
@@ -0,0 +1,6 @@
+from app.managers import vector_manager as vm
+from app.ingest import ingest_fedlex_pdf
+
+# One-shot bootstrap: ingest the English Fedlex PDF only if the store is empty.
+if len(vm.load_vectorstore().docstore._dict) == 0:
+    ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf")
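Since the script imports from the app package, it presumably needs to run from the project root, e.g. as python -m app.utils.pdf2vector (a hypothetical invocation; the commit does not wire it to a launcher).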
build.sh ADDED
@@ -0,0 +1 @@
+pip install -r requirements.txt
data/fedlex-data-admin-ch-de.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36886540730b78fbe3495246937e7d08159d889cbb8f37b3ab90c1738d34109a
+size 1116300
data/fedlex-data-admin-ch-en.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e96201d59e3f932aeaf6a32309d98ca2ac4dc0d4bab3d01f1c9df6482edc78ed
+size 1057197
examples/questions.txt ADDED
@@ -0,0 +1,2 @@
+Are non-EU UZH international Master's students eligible to apply for a work permit after graduation?
+I'm a non-EU student studying for a Master's degree at the University of Zurich with a student B residence permit. Can I legally work full-time during the summer breaks?
requirements.txt ADDED
@@ -0,0 +1,14 @@
+langchain
+openai
+faiss-cpu
+sentence-transformers
+PyPDF2
+gradio
+langchain-community
+langchain_huggingface
+langchain_openai
+pydantic
+ipykernel
+pypdf
+git-lfs
+python-dotenv
run.py ADDED
@@ -0,0 +1,4 @@
+from app import gradio_ui
+
+if __name__ == "__main__":
+    gradio_ui.iface.launch()
vectorstore/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e409000abe563b62994c4d34b7e51351337fae63943a7df672ad3f85316d5fce
+size 904749
vectorstore/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1441493373b74a67e26e39e2e3f0a953cc84b126be90c7e54805abe9b850d8bc
+size 386380