Commit 1df3d2a · Parent(s): 0f6450a
Upload local project

Files changed:
- .gitattributes +2 -0
- .gitignore +9 -0
- LICENSE +21 -0
- README.md +18 -9
- app/gradio_ui.py +44 -0
- app/ingest.py +26 -0
- app/managers/vector_manager.py +55 -0
- app/models/model.py +14 -0
- app/rag_chain.py +38 -0
- app/utils/handle_api.py +13 -0
- app/utils/pdf2vector.py +6 -0
- build.sh +1 -0
- data/fedlex-data-admin-ch-de.pdf +3 -0
- data/fedlex-data-admin-ch-en.pdf +3 -0
- examples/questions.txt +2 -0
- requirements.txt +13 -0
- run.py +4 -0
- vectorstore/index.faiss +3 -0
- vectorstore/index.pkl +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.faiss filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
+**/__pycache__/
+**/.vscode/
+**/.ipynb_checkpoints/
+**/.pytest_cache/
+# scripts/
+Archived/
+.env
+app/tests/
+huggingface_manage.md
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 nanfang_wuyu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md CHANGED
@@ -1,13 +1,22 @@
+<!-- ---
+title: "LLM_Powered_Legal_RAG_Demo"
+emoji: "📚"
+colorFrom: "indigo"
+colorTo: "pink"
+sdk: gradio
+python_version: 3.13
+accelerator: cpu # gpu when local, be online in future
+--- -->
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: LLM_Powered_Legal_RAG_Demo
+emoji: 📚
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
 sdk_version: 5.29.1
-app_file:
-pinned:
+app_file: run.py
+pinned: true
 license: mit
+accelerator: cpu
+python_version: 3.13
+---
-
-
-
app/gradio_ui.py ADDED
@@ -0,0 +1,44 @@
+import gradio as gr
+from app.rag_chain import get_qa_chain
+
+def load_example_questions(path="examples/questions.txt"):
+    with open(path, "r", encoding="utf-8") as f:
+        questions = [line.strip() for line in f if line.strip()]
+    return questions
+
+example_questions = load_example_questions()
+
+
+qa_chain = get_qa_chain()
+
+def ask_question(query):
+    result = qa_chain.invoke({"input": query})
+    # print(result)
+    answer = result["answer"]
+    sources = "\n\nSources:\n\n" + "\n".join([f"{doc.metadata['source']} {doc.metadata['article_number']}" for doc in result["context"]])
+    return answer + sources
+
+# iface = gr.Interface(fn=ask_question,
+#                      inputs=gr.Textbox(label="Ask something about the document..."),
+#                      outputs=gr.Textbox(label="Answer with sources"),
+#                      title="LLM-Powered Legal RAG Demo")
+
+def from_example_dropdown(selected_example):
+    return selected_example
+
+with gr.Blocks() as iface:
+    gr.Markdown("## LLM-Powered Legal RAG Demo")
+
+
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            textbox = gr.Textbox(label="Ask something about the Federal Act on Foreign Nationals and Integration ... ", lines=2)
+            dropdown = gr.Dropdown(label="📚 Choose an example question", choices=[''] + example_questions)
+            dropdown.change(fn=from_example_dropdown, inputs=dropdown, outputs=textbox)
+            submit_btn = gr.Button("Submit")
+
+        with gr.Column(scale=1):
+            output = gr.Textbox(label="Answer with sources", lines=12)
+
+    submit_btn.click(fn=ask_question, inputs=textbox, outputs=output)
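The `ask_question` handler above concatenates the model's answer with a source list built from each retrieved document's metadata. A minimal sketch of that formatting logic in isolation, with a stand-in result dict in place of a real chain invocation (the FakeDoc class and sample values are illustrative, not part of the repo):

# Stand-in for the dict returned by qa_chain.invoke({"input": ...}):
# an "answer" string plus a "context" list of retrieved documents.
class FakeDoc:
    def __init__(self, source, article_number):
        self.metadata = {"source": source, "article_number": article_number}

result = {
    "answer": "Yes, under certain conditions.",
    "context": [FakeDoc("fedlex-data-admin-ch-en.pdf", "Art. 21"),
                FakeDoc("fedlex-data-admin-ch-en.pdf", "Art. 40")],
}

sources = "\n\nSources:\n\n" + "\n".join(
    f"{doc.metadata['source']} {doc.metadata['article_number']}" for doc in result["context"]
)
print(result["answer"] + sources)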
app/ingest.py ADDED
@@ -0,0 +1,26 @@
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.schema import Document
+from app.managers import vector_manager as vm
+import os
+
+def ingest_fedlex_pdf(file_path):
+    loader = PyPDFLoader(file_path)
+    docs = loader.load()
+    full_text = "\n".join([doc.page_content for doc in docs])
+
+    chunks = full_text.split("Art.")
+    processed_docs = []
+    for i, chunk in enumerate(chunks[1:], start=1):  # skip preamble
+        content = f"Art. {chunk.strip()}"
+        metadata = {
+            "source": os.path.basename(file_path),
+            "article_number": f"Art. {i}"
+        }
+        processed_docs.append(Document(page_content=content, metadata=metadata))
+
+
+    vm.add_multi_documents(processed_docs)
+    print(f"✅ Ingested {len(processed_docs)} articles from {file_path}")
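Note that the article numbering here is positional: the text is split on the literal string "Art.", and the i-th chunk is labeled "Art. i". That only holds if every occurrence of "Art." is an article heading and the articles appear in order; in-text cross-references to "Art." would make the numbering drift. A toy run of the same heuristic (sample text is made up):

full_text = "Preamble text\nArt. 1 First rule\nArt. 2 Second rule"
chunks = full_text.split("Art.")

for i, chunk in enumerate(chunks[1:], start=1):  # chunks[0] is the preamble
    content = f"Art. {chunk.strip()}"
    print(i, repr(content))
# 1 'Art. 1 First rule'
# 2 'Art. 2 Second rule'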
app/managers/vector_manager.py ADDED
@@ -0,0 +1,55 @@
+import os
+import faiss
+from typing import List, Optional
+from langchain_community.vectorstores import FAISS
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain.schema import Document
+from app.models.model import Embedding_model
+
+BASE_PATH = "vectorstore"
+
+def create_new_vectorstore(embedding_model):
+    dim = len(embedding_model.embed_query("hello world"))
+    index = faiss.IndexFlatL2(dim)
+    return FAISS(
+        embedding_function=embedding_model,
+        index=index,
+        docstore=InMemoryDocstore(),
+        index_to_docstore_id={}
+    )
+
+
+def load_vectorstore() -> FAISS:
+    path = BASE_PATH
+
+    if os.path.exists(os.path.join(path, 'index.faiss')):
+        print("Reload existing faiss")
+        return FAISS.load_local(path, Embedding_model, allow_dangerous_deserialization=True)
+    else:
+        print("Create new faiss")
+        vs = create_new_vectorstore(Embedding_model)
+        save_vectorstore(vs)
+        return vs
+
+
+def save_vectorstore(vectorstore: FAISS):
+    path = BASE_PATH
+    vectorstore.save_local(path)
+
+
+def add_document(content: str, metadata: dict):
+    vs = load_vectorstore()
+    doc = Document(page_content=content, metadata=metadata)
+    vs.add_documents([doc])
+    save_vectorstore(vs)
+
+def add_multi_documents(processed_docs: list):
+    vs = load_vectorstore()
+    vs.add_documents(processed_docs)
+    save_vectorstore(vs)
+
+
+
+def get_relevant_documents(query: str, top_k: int = 5) -> List[Document]:
+    vs = load_vectorstore()  # load_vectorstore() takes no arguments
+    return vs.similarity_search(query, k=top_k)
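Taken together, the manager reloads (or lazily creates and persists) a FAISS index under vectorstore/ on every call. A hedged usage sketch, assuming the app package is importable from the project root and the all-MiniLM-L6-v2 embedding model can be downloaded; the sample text and query are illustrative only:

from app.managers import vector_manager as vm

# Persist one document, then retrieve by similarity.
vm.add_document("Art. 1 This Act governs entry and exit.",
                {"source": "example.pdf", "article_number": "Art. 1"})
hits = vm.get_relevant_documents("Who does the Act apply to?", top_k=3)
for doc in hits:
    print(doc.metadata["article_number"], doc.page_content[:60])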
app/models/model.py ADDED
@@ -0,0 +1,14 @@
+from langchain_huggingface import HuggingFaceEmbeddings
+from app.utils.handle_api import get_api
+# from langchain_community.llms import OpenAI
+from langchain_openai import ChatOpenAI
+
+
+# model = TinyLlamaModel()
+LLM = ChatOpenAI(
+    model="gpt-4o-mini",
+    api_key=get_api(),
+    temperature=0,
+    max_tokens=None
+)
+Embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
app/rag_chain.py ADDED
@@ -0,0 +1,38 @@
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate
+from app.models.model import LLM
+from app.managers import vector_manager as vm
+from langchain import hub
+
+def get_qa_chain():
+    db = vm.load_vectorstore()
+    retriever = db.as_retriever(search_kwargs={"k": 5})
+    # qa_chain = RetrievalQA.from_chain_type(
+    #     llm=LLM,
+    #     retriever=retriever,
+    #     return_source_documents=True
+    # )
+
+    # system_prompt = (
+    #     "You are a helpful ... "
+    #     "If you don't know the answer, say you don't know. "
+    #     "... maximum and keep the answer concise. "
+    #     "Context: {context}"
+    # )
+
+    # prompt = ChatPromptTemplate.from_messages(
+    #     [
+    #         ("system", system_prompt),
+    #         ("human", "{input}"),
+    #     ]
+    # )
+    prompt = hub.pull("langchain-ai/retrieval-qa-chat")
+    question_answer_chain = create_stuff_documents_chain(LLM, prompt)
+    chain = create_retrieval_chain(retriever, question_answer_chain)
+
+    # chain.invoke({"input": query})
+
+    return chain
+
+
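The object returned by get_qa_chain() is a standard LangChain retrieval chain: invoke takes {"input": ...} and returns a dict whose "answer" key holds the generated text and whose "context" key holds the retrieved Documents; this is the contract app/gradio_ui.py relies on. A minimal usage sketch, assuming OPENAI_API_KEY is set and the vector store is populated:

from app.rag_chain import get_qa_chain

chain = get_qa_chain()
result = chain.invoke({"input": "Can a student work full-time during semester breaks?"})

print(result["answer"])
for doc in result["context"]:  # the retrieved source chunks
    print(doc.metadata["source"], doc.metadata["article_number"])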
app/utils/handle_api.py ADDED
@@ -0,0 +1,13 @@
+
+import os
+import getpass
+from dotenv import load_dotenv
+
+def get_api():
+    load_dotenv()
+    api_key = os.getenv("OPENAI_API_KEY")
+    if api_key is None:
+        api_key = getpass.getpass("Enter API key for OpenAI: ")
+    if not api_key:
+        raise ValueError("API key is required. Please set it in the environment variable OPENAI_API_KEY or provide it directly.")
+    return api_key
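get_api() reads the key from a .env file (via python-dotenv) or the process environment, falling back to an interactive prompt. A minimal sketch of the expected setup; the file contents shown are a placeholder, not a real key:

# .env (project root):
#   OPENAI_API_KEY=sk-...
from app.utils.handle_api import get_api

key = get_api()         # loads .env, else prompts via getpass
print(key[:6] + "...")  # avoid echoing the full secret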
app/utils/pdf2vector.py ADDED
@@ -0,0 +1,6 @@
+from app.managers import vector_manager as vm
+from app.ingest import ingest_fedlex_pdf
+import os
+
+if len(vm.load_vectorstore().docstore._dict.values()) == 0:
+    ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf")
build.sh ADDED
@@ -0,0 +1 @@
+pip install -r requirements.txt
data/fedlex-data-admin-ch-de.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36886540730b78fbe3495246937e7d08159d889cbb8f37b3ab90c1738d34109a
+size 1116300
data/fedlex-data-admin-ch-en.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e96201d59e3f932aeaf6a32309d98ca2ac4dc0d4bab3d01f1c9df6482edc78ed
+size 1057197
examples/questions.txt ADDED
@@ -0,0 +1,2 @@
+Are non-EU UZH international master's students eligible to apply for a work permit after graduation?
+I'm a non-EU student studying for a master's degree at the University of Zurich with a student B residence permit. Can I legally work full-time during the summer breaks?
requirements.txt ADDED
@@ -0,0 +1,13 @@
+langchain
+openai
+faiss-cpu
+sentence-transformers
+PyPDF2
+gradio
+langchain-community
+langchain_huggingface
+langchain_openai
+pydantic
+ipykernel
+pypdf
+git-lfs
run.py ADDED
@@ -0,0 +1,4 @@
+from app import gradio_ui
+
+if __name__ == "__main__":
+    gradio_ui.iface.launch()
vectorstore/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e409000abe563b62994c4d34b7e51351337fae63943a7df672ad3f85316d5fce
+size 904749
vectorstore/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1441493373b74a67e26e39e2e3f0a953cc84b126be90c7e54805abe9b850d8bc
+size 386380