nanfangwuyu21 committed
Commit db81bb8
1 Parent(s): e98350b

Deutsch is now available: the language can be switched in the UI, each language uses its own embedding model, resources are ingested for both languages, and prompts and vector retrieval are supported in both languages.

app/config.py ADDED
@@ -0,0 +1 @@
+ language = "Deutsch" # "English"
app/gradio_ui.py CHANGED
@@ -1,45 +1,115 @@
  import gradio as gr
  from app.rag_chain import get_qa_chain
+ from app import config

  def load_example_questions(path="examples/questions.txt"):
      with open(path, "r", encoding="utf-8") as f:
          questions = [line.strip() for line in f if line.strip()]
      return questions

- example_questions = load_example_questions()
-
- qa_chain = get_qa_chain()
-
- def ask_question(query):
-     result = qa_chain.invoke({"input": query})
-     # print(result)
+ # Example questions in both languages
+ example_questions = {
+     "Deutsch": load_example_questions("examples/questions_de.txt"),
+     "English": load_example_questions("examples/questions_en.txt"),
+ }
+
+ def get_chain(language):
+     return get_qa_chain(language=language)
+
+ def ask_question(query, language):
+     qa_chain = get_chain(language)
+     result = qa_chain.invoke({"input": query})
      answer = result["answer"]
-     sources = "\n\nSources:\n\n" + "\n".join([f"{doc.metadata['source']} {doc.metadata['article_number']}" for doc in result["context"]])
+     sources = "\n\nSources / Quellen:\n\n" + "\n".join(
+         [f"{doc.metadata['source']} {doc.metadata.get('article_number', '')}" for doc in result.get("context", [])]
+     )
      return answer + sources

- # iface = gr.Interface(fn=ask_question,
- #                      inputs=gr.Textbox(label="Ask something about the document..."),
- #                      outputs=gr.Textbox(label="Answer with sources"),
- #                      title="LLM-Powered Legal RAG Demo")
-
  def from_example_dropdown(selected_example):
      return selected_example

- with gr.Blocks() as iface:
-     gr.Markdown("## LLM-Powered Legal RAG Demo")
+ def update_examples(language):
+     return gr.update(choices=[''] + example_questions[language], value='')
+
+ def get_texts(lang):
+     if lang == "Deutsch":
+         return {
+             "title": "## LLM-basiert Legal RAG Demo (basierend auf dem Bundesgesetz über die Ausländerinnen und Ausländer und über die Integration)",
+             "input_text": "Fragen Sie etwas zum Ausländer- und Integrationsgesetz ...",
+             "submit_button": "Absenden",
+             "output_text": "Antwort mit Quellen",
+             "lang_label": "🌐 Sprache",
+             "example_label": "Beispiel-Frage auswählen",
+         }
+     else:
+         return {
+             "title": "## LLM-Powered Legal RAG Demo (based on Federal Act on Foreign Nationals and Integration)",
+             "input_text": "Ask something about the Federal Act on Foreign Nationals and Integration ...",
+             "submit_button": "Submit",
+             "output_text": "Answer with sources",
+             "lang_label": "🌐 Language",
+             "example_label": "Choose an example question",
+         }
+
+ with gr.Blocks() as iface:
+     state = gr.State(config.language)
+     txt = get_texts(config.language)
+     title = gr.Markdown(txt["title"])

      with gr.Row():
          with gr.Column(scale=1):
-             textbox = gr.Textbox(label="Ask something about the Federal Act on Foreign Nationals and Integration ... ", lines=2)
-             dropdown = gr.Dropdown(label="📚 Choose an example question", choices=['']+example_questions)
-             dropdown.change(fn=from_example_dropdown, inputs=dropdown, outputs=textbox)
-             submit_btn = gr.Button("Submit")
-
+             lang_dropdown = gr.Dropdown(
+                 label=txt["lang_label"],
+                 choices=["Deutsch", "English"],
+                 value=config.language
+             )
+             input_box = gr.Textbox(
+                 label=txt["input_text"],
+                 lines=2
+             )
+             example_dropdown = gr.Dropdown(
+                 label=txt["example_label"],
+                 choices=[''] + example_questions[config.language],
+                 value=''
+             )
+             example_dropdown.change(fn=from_example_dropdown, inputs=example_dropdown, outputs=input_box)
+             lang_dropdown.change(fn=update_examples, inputs=lang_dropdown, outputs=example_dropdown)
+             submit_btn = gr.Button(txt["submit_button"])
+
          with gr.Column(scale=1):
-             output = gr.Textbox(label="Answer with sources", lines=12)
-             # output = gr.Markdown(label="Answer with sources", container=True)
+             output_box = gr.Textbox(label=txt["output_text"], lines=12)

-     submit_btn.click(fn=ask_question, inputs=textbox, outputs=output)
+     submit_btn.click(
+         fn=ask_question,
+         inputs=[input_box, lang_dropdown],
+         outputs=output_box
+     )
+
+     def update_labels(lang):
+         txt = get_texts(lang)
+         return (
+             gr.update(value=txt["title"]),
+             gr.update(label=txt["input_text"]),
+             gr.update(label=txt["output_text"]),
+             gr.update(value=txt["submit_button"]),
+             gr.update(label=txt["lang_label"]),
+             gr.update(label=txt["example_label"], choices=[''] + example_questions[lang], value='')
+         )
+
+     lang_dropdown.change(
+         fn=update_labels,
+         inputs=lang_dropdown,
+         outputs=[
+             title,
+             input_box,
+             output_box,
+             submit_btn,
+             lang_dropdown,
+             example_dropdown,
+         ]
+     )
+
+ if __name__ == "__main__":
+     iface.launch()
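For reference, the new handler can be exercised directly, without the browser UI. A minimal sketch, assuming the FAISS stores from this commit are present and the LLM credentials are configured; the questions are illustrative:

```python
# Minimal sketch: call the Gradio handler directly, once per language.
from app.gradio_ui import ask_question

print(ask_question("Was sind die Voraussetzungen für den Familiennachzug nach dem Gesetz?", "Deutsch"))
print(ask_question("What are the requirements for family reunification under the Act?", "English"))
```

Launching the full interface remains a plain `iface.launch()`, as the `__main__` guard above shows.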
app/ingest.py CHANGED
@@ -3,7 +3,7 @@ from langchain.schema import Document
  from app.managers import vector_manager as vm
  import os

- def ingest_fedlex_pdf(file_path):
+ def ingest_fedlex_pdf(file_path, language):
      loader = PyMuPDFLoader(file_path)
      docs = loader.load()
      full_text = "\n".join([doc.page_content for doc in docs])
@@ -19,7 +19,7 @@ def ingest_fedlex_pdf(file_path):
          processed_docs.append(Document(page_content=chunk, metadata=metadata))

-     vm.add_multi_documents(processed_docs)
+     vm.add_multi_documents(processed_docs, language)
      print(f"✅ Ingested {len(processed_docs)} articles from {file_path}")
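A sketch of the updated signature; the two PDF paths are the ones used by app/utils/pdf2vector.py in this commit:

```python
# Sketch: ingest each statute PDF into its language-specific store.
from app.ingest import ingest_fedlex_pdf

ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf", "English")
ingest_fedlex_pdf("data/fedlex-data-admin-ch-de.pdf", "Deutsch")
```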
 
app/managers/vector_manager.py CHANGED
@@ -4,10 +4,19 @@ from typing import List, Optional
  from langchain_community.vectorstores import FAISS
  from langchain_community.docstore.in_memory import InMemoryDocstore
  from langchain.schema import Document
- from app.models.model import Embedding_model
+ from app.models.model import Embedding_model_en, Embedding_model_de
+ from app import config

  BASE_PATH = "vectorstore"

+ VECTORSTORE_TYPES = {
+     "English": "English_index",
+     "Deutsch": "Deutsch_index"
+ }
+
+ def get_embedding_model(language):
+     return Embedding_model_en if language == "English" else Embedding_model_de
+
  def create_new_vectorstore(embedding_model):
      dim = len(embedding_model.embed_query("hello world"))
      index = faiss.IndexFlatL2(dim)
@@ -19,37 +28,40 @@ def create_new_vectorstore(embedding_model):
      )


- def load_vectorstore() -> FAISS:
-     path = BASE_PATH
-
+ def load_vectorstore(store_type: str) -> FAISS:
+     assert store_type in VECTORSTORE_TYPES, "Invalid vectorstore type."
+     path = os.path.join(BASE_PATH, VECTORSTORE_TYPES[store_type])
+
+     print(f"Load vectorstore from language {store_type}")
      if os.path.exists(os.path.join(path, 'index.faiss')):
          print("Reload existing faiss")
-         return FAISS.load_local(path, Embedding_model, allow_dangerous_deserialization=True)
+         return FAISS.load_local(path, get_embedding_model(store_type), allow_dangerous_deserialization=True)
      else:
          print("Create new faiss")
-         vs = create_new_vectorstore(Embedding_model)
-         save_vectorstore(vs)
+         vs = create_new_vectorstore(get_embedding_model(store_type))
+         save_vectorstore(vs, store_type)
          return vs


- def save_vectorstore(vectorstore: FAISS):
-     path = BASE_PATH
+ def save_vectorstore(vectorstore: FAISS, store_type: str):
+     path = os.path.join(BASE_PATH, VECTORSTORE_TYPES[store_type])
      vectorstore.save_local(path)


- def add_document(content: str, metadata: dict):
-     vs = load_vectorstore()
+ def add_document(content: str, metadata: dict, store_type: str):
+     assert store_type == metadata.get("type")
+     vs = load_vectorstore(store_type)
      doc = Document(page_content=content, metadata=metadata)
      vs.add_documents([doc])
-     save_vectorstore(vs)
+     save_vectorstore(vs, store_type)

- def add_multi_documents(processed_docs: list):
-     vs = load_vectorstore()
+ def add_multi_documents(processed_docs: list, store_type: str):
+     vs = load_vectorstore(store_type)
      vs.add_documents(processed_docs)
-     save_vectorstore(vs)
-
+     save_vectorstore(vs, store_type)


- def get_relevant_documents(query: str, top_k: int = 5) -> List[Document]:
-     vs = load_vectorstore()
+ def get_relevant_documents(store_type, query: str, top_k: int = 10) -> List[Document]:
+     vs = load_vectorstore(store_type)
      return vs.similarity_search(query, k=top_k)
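With the store type now passed explicitly, retrieval against either index looks like this; the query strings are illustrative:

```python
# Sketch: query the per-language FAISS indexes directly.
from app.managers import vector_manager as vm

docs_de = vm.get_relevant_documents("Deutsch", "Voraussetzungen für den Familiennachzug", top_k=5)
docs_en = vm.get_relevant_documents("English", "requirements for family reunification", top_k=5)

for doc in docs_de + docs_en:
    print(doc.metadata.get("source"), doc.metadata.get("article_number", ""))
```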
app/models/model.py CHANGED
@@ -12,4 +12,6 @@ LLM = ChatOpenAI(
      temperature=0.2,
      max_tokens=None
  )
- Embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ Embedding_model_en = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # only English
+ Embedding_model_de = HuggingFaceEmbeddings(model_name="danielheinz/e5-base-sts-en-de")
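The two embedding models generally produce vectors of different dimensionality, which is one reason each language gets its own FAISS index. A quick check, mirroring the dimension probe in create_new_vectorstore:

```python
# Sketch: print each model's embedding dimension instead of assuming it.
from app.models.model import Embedding_model_en, Embedding_model_de

print(len(Embedding_model_en.embed_query("hello world")))  # MiniLM dimension
print(len(Embedding_model_de.embed_query("hallo Welt")))   # e5-base dimension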
app/rag_chain.py CHANGED
@@ -4,32 +4,19 @@ from langchain_core.prompts import ChatPromptTemplate
  from app.models.model import LLM
  from app.managers import vector_manager as vm
  from langchain import hub
+ from app.utils.prompts import prompts
+ from app.utils import pdf2vector

- def get_qa_chain():
-     db = vm.load_vectorstore()
+ def get_qa_chain(language: str = "Deutsch"):
+     db = vm.load_vectorstore(language)
      retriever = db.as_retriever(search_kwargs={"k": 5})
      # qa_chain = RetrievalQA.from_chain_type(
      #     llm=LLM,
      #     retriever=retriever,
      #     return_source_documents=True
      # )
-     system_prompt = """
-     You are a helpful legal assistant. Use the provided legal document excerpts to carefully and clearly answer the user's question.
-
-     INSTRUCTIONS:
-     - Base your answer strictly on the DOCUMENT CONTEXT above.
-     - Provide a detailed explanation that fully addresses the question.
-     - For each claim or statement, cite the relevant source explicitly (e.g., "Art. 5" or "Art. 12").
-     - You don't need to use all of the provided sources, but use the most relevant and helpful ones.
-     - If the document context does not include enough information to answer the question, respond with: 'Sorry, I couldn't find sufficient information to answer your question.'
-     - Do NOT make up facts or speculate beyond the document.
-     - Be accurate, structured, and formal in tone.
-
-     DOCUMENT CONTEXT:
-     {context}
-
-     QUESTION:
-     """
+     system_prompt = prompts[language]

      prompt = ChatPromptTemplate.from_messages(
          [
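Downstream usage mirrors ask_question in app/gradio_ui.py; a short sketch, with an illustrative question:

```python
# Sketch: build a language-specific chain and query it.
from app.rag_chain import get_qa_chain

chain = get_qa_chain(language="Deutsch")
result = chain.invoke({"input": "Welche Regeln gelten für ausländische Staatsangehörige, die in der Schweiz arbeiten?"})

print(result["answer"])
for doc in result.get("context", []):
    print(doc.metadata["source"], doc.metadata.get("article_number", ""))
```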
app/utils/pdf2vector.py CHANGED
@@ -2,5 +2,9 @@ from app.managers import vector_manager as vm
  from app.ingest import ingest_fedlex_pdf
  import os

- if len(vm.load_vectorstore().docstore._dict.values()) == 0:
-     ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf")
+ if len(vm.load_vectorstore("English").docstore._dict.values()) == 0:
+     ingest_fedlex_pdf("data/fedlex-data-admin-ch-en.pdf", "English")
+
+
+ if len(vm.load_vectorstore("Deutsch").docstore._dict.values()) == 0:
+     ingest_fedlex_pdf("data/fedlex-data-admin-ch-de.pdf", "Deutsch")
app/utils/prompts.py ADDED
@@ -0,0 +1,42 @@
+
+ prompts = {
+     'English':
+     """
+     You are a helpful legal assistant. Use the provided legal document excerpts to carefully and clearly answer the user's question.
+
+     DOCUMENT CONTEXT:
+     {context}
+
+     INSTRUCTIONS:
+     - Base your answer strictly on the DOCUMENT CONTEXT above.
+     - Provide a detailed explanation that fully addresses the question.
+     - For each claim or statement, cite the relevant source explicitly (e.g., "Art. 5" or "Art. 12").
+     - You don't need to use all of the provided sources, but use the most relevant and helpful ones.
+     - If the document context does not include enough information to answer the question, respond with: 'Sorry, I couldn't find sufficient information to answer your question.'
+     - Do NOT make up facts or speculate beyond the document.
+     - Be accurate, structured, and formal in tone.
+
+     QUESTION:
+     """,
+
+     'Deutsch':
+     """
+     Sie sind ein hilfreicher juristischer Assistent. Verwenden Sie die bereitgestellten Auszüge aus juristischen Dokumenten, um die Frage des Nutzers sorgfältig und klar zu beantworten.
+
+     DOKUMENTKONTEXT:
+     {context}
+
+     ANWEISUNGEN:
+     - Stützen Sie Ihre Antwort ausschließlich auf den obigen DOKUMENTKONTEXT.
+     - Geben Sie eine ausführliche Erklärung, die die Frage vollständig beantwortet.
+     - Zitieren Sie für jede Behauptung oder Aussage explizit die relevante Quelle (z. B. „Art. 5“ oder „Art. 12“).
+     - Sie müssen nicht alle bereitgestellten Quellen verwenden, sondern die relevantesten und hilfreichsten.
+     - Falls der Dokumentenkontext nicht genügend Informationen zur Beantwortung der Frage enthält, antworten Sie mit: „Entschuldigung, ich konnte keine ausreichenden Informationen finden, um Ihre Frage zu beantworten.“
+     - Erfinden Sie keine Fakten und spekulieren Sie nicht über den Dokumentenkontext hinaus.
+     - Seien Sie genau, strukturiert und verwenden Sie einen formellen Ton.
+
+     FRAGE:
+     """
+ }
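For reference, a sketch of how a language-specific system prompt plugs into the chain; the ("human", "{input}") message is an assumption, since the tail of app/rag_chain.py is truncated in this diff:

```python
# Sketch: combine the per-language system prompt with the user question.
# The ("human", "{input}") message is an assumption about the unchanged
# remainder of app/rag_chain.py.
from langchain_core.prompts import ChatPromptTemplate
from app.utils.prompts import prompts

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", prompts["Deutsch"]),
        ("human", "{input}"),
    ]
)
print(prompt.invoke({"context": "Art. 42 ...", "input": "Wer kann nachgezogen werden?"}))
```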
examples/questions_de.txt ADDED
@@ -0,0 +1,7 @@
+ Unter welchen Bedingungen kann eine befristete Aufenthaltsbewilligung verlängert oder in eine unbefristete umgewandelt werden?
+ Können erwachsene Kinder oder ältere Eltern im Rahmen des Familiennachzugs in die Schweiz geholt werden?
+ Können internationale Masterstudierende aus Nicht-EU-Ländern nach dem Abschluss eine Arbeitserlaubnis beantragen?
+ Was sind die Voraussetzungen für den Familiennachzug nach dem Gesetz?
+ Welche verschiedenen Arten von Aufenthaltsbewilligungen gibt es für ausländische Staatsangehörige in der Schweiz?
+ Sind ausländische Staatsangehörige verpflichtet, im Rahmen der Integration eine Landessprache zu erlernen?
+ Welche Regeln gelten für ausländische Staatsangehörige, die in der Schweiz arbeiten?
examples/{questions.txt → questions_en.txt} RENAMED
@@ -1,7 +1,6 @@
- What are the criteria for obtaining a settlement permit (C permit)?
  Under what conditions can a temporary residence permit be extended or converted to a permanent one?
- Are non-EU International Masters students eligible to apply for a work permit after graduation?
  Can adult children or elderly parents be brought to Switzerland through family reunification?
+ Are non-EU International Masters students eligible to apply for a work permit after graduation?
  What are the requirements for family reunification under the Act?
  What are the different types of residence permits available for foreign nationals in Switzerland?
  Are foreign nationals required to learn a national language as part of integration?
vectorstore/Deutsch_index/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:806f6a712b5d2af2fbd2db390d724da5be4d1ba2628943ec9c3a5f13cc0bc19a
+ size 642093
vectorstore/Deutsch_index/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b198b4a9fc0d0670620b8d11ed64ede074465c56dae7cc9e46e7eeeb47769c7
+ size 346690
vectorstore/{index.faiss → English_index/index.faiss} RENAMED
File without changes
vectorstore/{index.pkl → English_index/index.pkl} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a5c1d56fe5bb49fe416f5003ea84a1b6a640a310ac7c259e304348eb3ffb880e
+ oid sha256:cea346b163c1ac12f4413a20c83207c1662c0a814c96cc4a0775e5a9e1f19e6d
  size 329788