nanfangwuyu21 committed
Commit db81bb8
1 Parent(s): e98350b

Deutsch is now available: the language can be switched in the UI, each language uses its own embedding model, resources are ingested for both languages, and prompts and vector retrieval are supported in both languages.

app/config.py ADDED
@@ -0,0 +1 @@
+ language = "Deutsch" # "English"
app/gradio_ui.py CHANGED
@@ -1,45 +1,115 @@
  import gradio as gr
  from app.rag_chain import get_qa_chain
+ from app import config

  def load_example_questions(path="examples/questions.txt"):
      with open(path, "r", encoding="utf-8") as f:
          questions = [line.strip() for line in f if line.strip()]
      return questions

- example_questions = load_example_questions()
-
- qa_chain = get_qa_chain()
-
- def ask_question(query):
-     result = qa_chain.invoke({"input": query})
-     # print(result)
+ # Example questions in both languages
+ example_questions = {
+     "Deutsch": load_example_questions("examples/questions_de.txt"),
+     "English": load_example_questions("examples/questions_en.txt"),
+ }
+
+ def get_chain(language):
+     return get_qa_chain(language=language)
+
+ def ask_question(query, language):
+     qa_chain = get_chain(language)
+     result = qa_chain.invoke({"input": query})
      answer = result["answer"]
-     sources = "\n\nSources:\n\n" + "\n".join([f"{doc.metadata['source']} {doc.metadata['article_number']}" for doc in result["context"]])
+     sources = "\n\nSources / Quellen:\n\n" + "\n".join(
+         [f"{doc.metadata['source']} {doc.metadata.get('article_number', '')}" for doc in result.get("context", [])]
+     )
      return answer + sources

- # iface = gr.Interface(fn=ask_question,
- #                      inputs=gr.Textbox(label="Ask something about the document..."),
- #                      outputs=gr.Textbox(label="Answer with sources"),
- #                      title="LLM-Powered Legal RAG Demo")
-
  def from_example_dropdown(selected_example):
      return selected_example

- with gr.Blocks() as iface:
-     gr.Markdown("## LLM-Powered Legal RAG Demo")
+ def update_examples(language):
+     return gr.update(choices=[''] + example_questions[language], value='')
+
+ def get_texts(lang):
+     if lang == "Deutsch":
+         return {
+             "title": "## LLM-basiert Legal RAG Demo (basierend auf dem Bundesgesetz über die Ausländerinnen und Ausländer und über die Integration)",
+             "input_text": "Fragen Sie etwas zum Ausländer- und Integrationsgesetz ...",
+             "submit_button": "Absenden",
+             "output_text": "Antwort mit Quellen",
+             "lang_label": "🌐 Sprache",
+             "example_label": "Beispiel-Frage auswählen",
+         }
+     else:
+         return {
+             "title": "## LLM-Powered Legal RAG Demo (based on Federal Act on Foreign Nationals and Integration)",
+             "input_text": "Ask something about the Federal Act on Foreign Nationals and Integration ...",
+             "submit_button": "Submit",
+             "output_text": "Answer with sources",
+             "lang_label": "🌐 Language",
+             "example_label": "Choose an example question",
+         }
+
+ with gr.Blocks() as iface:
+     state = gr.State(config.language)
+     txt = get_texts(config.language)
+     title = gr.Markdown(txt["title"])

      with gr.Row():
          with gr.Column(scale=1):
-             textbox = gr.Textbox(label="Ask something about the Federal Act on Foreign Nationals and Integration ... ", lines=2)
-             dropdown = gr.Dropdown(label="📚 Choose an example question", choices=['']+example_questions)
-             dropdown.change(fn=from_example_dropdown, inputs=dropdown, outputs=textbox)
-             submit_btn = gr.Button("Submit")
-
+             lang_dropdown = gr.Dropdown(
+                 label=txt["lang_label"],
+                 choices=["Deutsch", "English"],
+                 value=config.language
+             )
+             input_box = gr.Textbox(
+                 label=txt["input_text"],
+                 lines=2
+             )
+             example_dropdown = gr.Dropdown(
+                 label=txt["example_label"],
+                 choices=[''] + example_questions[config.language],
+                 value=''
+             )
+             example_dropdown.change(fn=from_example_dropdown, inputs=example_dropdown, outputs=input_box)
+             lang_dropdown.change(fn=update_examples, inputs=lang_dropdown, outputs=example_dropdown)
+             submit_btn = gr.Button(txt["submit_button"])
+
          with gr.Column(scale=1):
-             output = gr.Textbox(label="Answer with sources", lines=12)
-             # output = gr.Markdown(label="Answer with sources", container=True)
+             output_box = gr.Textbox(label=txt["output_text"], lines=12)

-     submit_btn.click(fn=ask_question, inputs=textbox, outputs=output)
+     submit_btn.click(
+         fn=ask_question,
+         inputs=[input_box, lang_dropdown],
+         outputs=output_box
+     )
+
+     def update_labels(lang):
+         txt = get_texts(lang)
+         return (
+             gr.update(value=txt["title"]),
+             gr.update(label=txt["input_text"]),
+             gr.update(label=txt["output_text"]),
+             gr.update(value=txt["submit_button"]),
+             gr.update(label=txt["lang_label"]),
+             gr.update(label=txt["example_label"], choices=[''] + example_questions[lang], value='')
+         )
+
+     lang_dropdown.change(
+         fn=update_labels,
+         inputs=lang_dropdown,
+         outputs=[
+             title,
+             input_box,
+             output_box,
+             submit_btn,
+             lang_dropdown,
+             example_dropdown,
+         ]
+     )
+
+ if __name__ == "__main__":
+     iface.launch()
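For reference, the new handler can be exercised directly, without the browser UI. A minimal sketch, assuming the FAISS stores from this commit are present and the LLM credentials are configured; the questions are illustrative:

```python
# Minimal sketch: call the Gradio handler directly, once per language.
from app.gradio_ui import ask_question

print(ask_question("Was sind die Voraussetzungen für den Familiennachzug nach dem Gesetz?", "Deutsch"))
print(ask_question("What are the requirements for family reunification under the Act?", "English"))
```

Launching the full interface remains a plain `iface.launch()`, as the `__main__` guard above shows.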
app/ingest.py CHANGED
@@ -3,7 +3,7 @@ from langchain.schema import Document
  from app.managers import vector_manager as vm
  import os

- def ingest_fedlex_pdf(file_path):
+ def ingest_fedlex_pdf(file_path, language):
      loader = PyMuPDFLoader(file_path)
      docs = loader.load()
      full_text = "\n".join([doc.page_content for doc in docs])
@@ -19,7 +19,7 @@ def ingest_fedlex_pdf(file_path):
          processed_docs.append(Document(page_content=chunk, metadata=metadata))

-     vm.add_multi_documents(processed_docs)
+     vm.add_multi_documents(processed_docs, language)
      print(f"✅ Ingested {len(processed_docs)} articles from {file_path}")
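A sketch of the updated signature; the two PDF paths are the ones used by app/utils/pdf2vector.py in this commit:

```python
# Sketch: ingest each statute PDF into its language-specific store.
from app.ingest import ingest_fedlex_pdf

ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf", "English")
ingest_fedlex_pdf("data/fedlex-data-admin-ch-de.pdf", "Deutsch")
```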
 
app/managers/vector_manager.py CHANGED
@@ -4,10 +4,19 @@ from typing import List, Optional
  from langchain_community.vectorstores import FAISS
  from langchain_community.docstore.in_memory import InMemoryDocstore
  from langchain.schema import Document
- from app.models.model import Embedding_model
+ from app.models.model import Embedding_model_en, Embedding_model_de
+ from app import config

  BASE_PATH = "vectorstore"

+ VECTORSTORE_TYPES = {
+     "English": "English_index",
+     "Deutsch": "Deutsch_index"
+ }
+
+ def get_embedding_model(language):
+     return Embedding_model_en if language == "English" else Embedding_model_de
+
  def create_new_vectorstore(embedding_model):
      dim = len(embedding_model.embed_query("hello world"))
      index = faiss.IndexFlatL2(dim)
@@ -19,37 +28,40 @@ def create_new_vectorstore(embedding_model):
      )


- def load_vectorstore() -> FAISS:
-     path = BASE_PATH
-
+ def load_vectorstore(store_type: str) -> FAISS:
+     assert store_type in VECTORSTORE_TYPES, "Invalid vectorstore type."
+     path = os.path.join(BASE_PATH, VECTORSTORE_TYPES[store_type])
+
+     print(f"Load vectorstore from language {store_type}")
      if os.path.exists(os.path.join(path, 'index.faiss')):
          print("Reload existing faiss")
-         return FAISS.load_local(path, Embedding_model, allow_dangerous_deserialization=True)
+         return FAISS.load_local(path, get_embedding_model(store_type), allow_dangerous_deserialization=True)
      else:
          print("Create new faiss")
-         vs = create_new_vectorstore(Embedding_model)
-         save_vectorstore(vs)
+         vs = create_new_vectorstore(get_embedding_model(store_type))
+         save_vectorstore(vs, store_type)
          return vs


- def save_vectorstore(vectorstore: FAISS):
-     path = BASE_PATH
+ def save_vectorstore(vectorstore: FAISS, store_type: str):
+     path = os.path.join(BASE_PATH, VECTORSTORE_TYPES[store_type])
      vectorstore.save_local(path)


- def add_document(content: str, metadata: dict):
-     vs = load_vectorstore()
+ def add_document(content: str, metadata: dict, store_type: str):
+     assert store_type == metadata.get("type")
+     vs = load_vectorstore(store_type)
      doc = Document(page_content=content, metadata=metadata)
      vs.add_documents([doc])
-     save_vectorstore(vs)
+     save_vectorstore(vs, store_type)

- def add_multi_documents(processed_docs: list):
-     vs = load_vectorstore()
+ def add_multi_documents(processed_docs: list, store_type: str):
+     vs = load_vectorstore(store_type)
      vs.add_documents(processed_docs)
-     save_vectorstore(vs)
-
+     save_vectorstore(vs, store_type)


- def get_relevant_documents(query: str, top_k: int = 5) -> List[Document]:
-     vs = load_vectorstore()
+ def get_relevant_documents(store_type, query: str, top_k: int = 10) -> List[Document]:
+     vs = load_vectorstore(store_type)
      return vs.similarity_search(query, k=top_k)
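With the store type now passed explicitly, retrieval against either index looks like this; the query strings are illustrative:

```python
# Sketch: query the per-language FAISS indexes directly.
from app.managers import vector_manager as vm

docs_de = vm.get_relevant_documents("Deutsch", "Voraussetzungen für den Familiennachzug", top_k=5)
docs_en = vm.get_relevant_documents("English", "requirements for family reunification", top_k=5)

for doc in docs_de + docs_en:
    print(doc.metadata.get("source"), doc.metadata.get("article_number", ""))
```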
app/models/model.py CHANGED
@@ -12,4 +12,6 @@ LLM = ChatOpenAI(
      temperature=0.2,
      max_tokens=None
  )
- Embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ Embedding_model_en = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # only English
+ Embedding_model_de = HuggingFaceEmbeddings(model_name="danielheinz/e5-base-sts-en-de")
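The two embedding models generally produce vectors of different dimensionality, which is one reason each language gets its own FAISS index. A quick check, mirroring the dimension probe in create_new_vectorstore:

```python
# Sketch: print each model's embedding dimension instead of assuming it.
from app.models.model import Embedding_model_en, Embedding_model_de

print(len(Embedding_model_en.embed_query("hello world")))  # MiniLM dimension
print(len(Embedding_model_de.embed_query("hallo Welt")))   # e5-base dimension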
app/rag_chain.py CHANGED
@@ -4,32 +4,19 @@ from langchain_core.prompts import ChatPromptTemplate
  from app.models.model import LLM
  from app.managers import vector_manager as vm
  from langchain import hub
+ from app.utils.prompts import prompts
+ from app.utils import pdf2vector

- def get_qa_chain():
-     db = vm.load_vectorstore()
+ def get_qa_chain(language: str = "Deutsch"):
+     db = vm.load_vectorstore(language)
      retriever = db.as_retriever(search_kwargs={"k": 5})
      # qa_chain = RetrievalQA.from_chain_type(
      #     llm=LLM,
      #     retriever=retriever,
      #     return_source_documents=True
      # )
-     system_prompt = """
-     You are a helpful legal assistant. Use the provided legal document excerpts to carefully and clearly answer the user's question.
-
-     INSTRUCTIONS:
-     - Base your answer strictly on the DOCUMENT CONTEXT above.
-     - Provide a detailed explanation that fully addresses the question.
-     - For each claim or statement, cite the relevant source explicitly (e.g., "Art. 5" or "Art. 12").
-     - You don't need to use all of the provided sources, but use the most relevant and helpful ones.
-     - If the document context does not include enough information to answer the question, respond with: 'Sorry, I couldn't find sufficient information to answer your question.'
-     - Do NOT make up facts or speculate beyond the document.
-     - Be accurate, structured, and formal in tone.
-
-     DOCUMENT CONTEXT:
-     {context}
-
-     QUESTION:
-     """
+     system_prompt = prompts[language]

      prompt = ChatPromptTemplate.from_messages(
          [
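Downstream usage mirrors ask_question in app/gradio_ui.py; a short sketch, with an illustrative question:

```python
# Sketch: build a language-specific chain and query it.
from app.rag_chain import get_qa_chain

chain = get_qa_chain(language="Deutsch")
result = chain.invoke({"input": "Welche Regeln gelten für ausländische Staatsangehörige, die in der Schweiz arbeiten?"})

print(result["answer"])
for doc in result.get("context", []):
    print(doc.metadata["source"], doc.metadata.get("article_number", ""))
```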
app/utils/pdf2vector.py CHANGED
@@ -2,5 +2,9 @@ from app.managers import vector_manager as vm
  from app.ingest import ingest_fedlex_pdf
  import os

- if len(vm.load_vectorstore().docstore._dict.values()) == 0:
-     ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf")
+ if len(vm.load_vectorstore("English").docstore._dict.values()) == 0:
+     ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf", "English")
+
+
+ if len(vm.load_vectorstore("Deutsch").docstore._dict.values()) == 0:
+     ingest_fedlex_pdf("data/fedlex-data-admin-ch-de.pdf", "Deutsch")
app/utils/prompts.py ADDED
@@ -0,0 +1,42 @@
+
+ prompts = {
+     'English':
+     """
+     You are a helpful legal assistant. Use the provided legal document excerpts to carefully and clearly answer the user's question.
+
+     DOCUMENT CONTEXT:
+     {context}
+
+     INSTRUCTIONS:
+     - Base your answer strictly on the DOCUMENT CONTEXT above.
+     - Provide a detailed explanation that fully addresses the question.
+     - For each claim or statement, cite the relevant source explicitly (e.g., "Art. 5" or "Art. 12").
+     - You don't need to use all of the provided sources, but use the most relevant and helpful ones.
+     - If the document context does not include enough information to answer the question, respond with: 'Sorry, I couldn't find sufficient information to answer your question.'
+     - Do NOT make up facts or speculate beyond the document.
+     - Be accurate, structured, and formal in tone.
+
+     QUESTION:
+     """,
+
+     'Deutsch':
+     """
+     Sie sind ein hilfreicher juristischer Assistent. Verwenden Sie die bereitgestellten Auszüge aus juristischen Dokumenten, um die Frage des Nutzers sorgfältig und klar zu beantworten.
+
+     DOKUMENTKONTEXT:
+     {context}
+
+     ANWEISUNGEN:
+     - Stützen Sie Ihre Antwort ausschließlich auf den obigen DOKUMENTKONTEXT.
+     - Geben Sie eine ausführliche Erklärung, die die Frage vollständig beantwortet.
+     - Zitieren Sie für jede Behauptung oder Aussage explizit die relevante Quelle (z. B. „Art. 5“ oder „Art. 12“).
+     - Sie müssen nicht alle bereitgestellten Quellen verwenden, sondern die relevantesten und hilfreichsten.
+     - Falls der Dokumentenkontext nicht genügend Informationen zur Beantwortung der Frage enthält, antworten Sie mit: „Entschuldigung, ich konnte keine ausreichenden Informationen finden, um Ihre Frage zu beantworten.“
+     - Erfinden Sie keine Fakten und spekulieren Sie nicht über den Dokumentenkontext hinaus.
+     - Seien Sie genau, strukturiert und verwenden Sie einen formellen Ton.
+
+     FRAGE:
+     """
+ }
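For reference, a sketch of how a language-specific system prompt plugs into the chain; the ("human", "{input}") message is an assumption, since the tail of app/rag_chain.py is truncated in this diff:

```python
# Sketch: combine the per-language system prompt with the user question.
# The ("human", "{input}") message is an assumption about the unchanged
# remainder of app/rag_chain.py.
from langchain_core.prompts import ChatPromptTemplate
from app.utils.prompts import prompts

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", prompts["Deutsch"]),
        ("human", "{input}"),
    ]
)
print(prompt.invoke({"context": "Art. 42 ...", "input": "Wer kann nachgezogen werden?"}))
```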
examples/questions_de.txt ADDED
@@ -0,0 +1,7 @@
+ Unter welchen Bedingungen kann eine befristete Aufenthaltsbewilligung verlängert oder in eine unbefristete umgewandelt werden?
+ Können erwachsene Kinder oder ältere Eltern im Rahmen des Familiennachzugs in die Schweiz geholt werden?
+ Können internationale Masterstudierende aus Nicht-EU-Ländern nach dem Abschluss eine Arbeitserlaubnis beantragen?
+ Was sind die Voraussetzungen für den Familiennachzug nach dem Gesetz?
+ Welche verschiedenen Arten von Aufenthaltsbewilligungen gibt es für ausländische Staatsangehörige in der Schweiz?
+ Sind ausländische Staatsangehörige verpflichtet, im Rahmen der Integration eine Landessprache zu erlernen?
+ Welche Regeln gelten für ausländische Staatsangehörige, die in der Schweiz arbeiten?
examples/{questions.txt → questions_en.txt} RENAMED
@@ -1,7 +1,6 @@
- What are the criteria for obtaining a settlement permit (C permit)?
  Under what conditions can a temporary residence permit be extended or converted to a permanent one?
- Are non-EU International Masters students eligible to apply for a work permit after graduation?
  Can adult children or elderly parents be brought to Switzerland through family reunification?
+ Are non-EU International Masters students eligible to apply for a work permit after graduation?
  What are the requirements for family reunification under the Act?
  What are the different types of residence permits available for foreign nationals in Switzerland?
  Are foreign nationals required to learn a national language as part of integration?
vectorstore/Deutsch_index/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:806f6a712b5d2af2fbd2db390d724da5be4d1ba2628943ec9c3a5f13cc0bc19a
+ size 642093
vectorstore/Deutsch_index/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b198b4a9fc0d0670620b8d11ed64ede074465c56dae7cc9e46e7eeeb47769c7
+ size 346690
vectorstore/{index.faiss → English_index/index.faiss} RENAMED
File without changes
vectorstore/{index.pkl → English_index/index.pkl} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a5c1d56fe5bb49fe416f5003ea84a1b6a640a310ac7c259e304348eb3ffb880e
+ oid sha256:cea346b163c1ac12f4413a20c83207c1662c0a814c96cc4a0775e5a9e1f19e6d
  size 329788