Spaces:

MikeMann
/

PrototypGrundschutzChatbot

Paused

App Files Files Community

MikeMann committed on Dec 12, 2024

Commit

494b09f

1 Parent(s): 2c91b42

Retry

Browse files

Files changed (1) hide show

app.py +484 -0

app.py ADDED Viewed

	@@ -0,0 +1,484 @@

+import os
+# Environment setup. These assignments run before the heavy imports further down.
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA initialization
+# NOTE(review): nothing visible in this file reads this variable; FAISS.load_local
+# receives allow_dangerous_deserialization as an explicit keyword argument instead.
+os.environ["allow_dangerous_deserialization"] = "True"
+print(os.getcwd())
+# Startup diagnostic only: the index is not loaded here, and a missing file
+# merely prints a message instead of aborting.
+embedding_path="/home/user/app/docs/_embeddings/index.faiss"
+print(f"Loading FAISS index from: {embedding_path}")
+if not os.path.exists(embedding_path):
+    print("File not found!")
+# Hugging Face access token, read from the 'Gated_Repo' environment variable
+# (None when the variable is unset); passed to login() after the imports.
+HF_KEY=os.getenv('Gated_Repo')
+import spaces
+import time
+from typing import final
+import asyncio
+import torch
+import gradio as gr
+import threading
+import re
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.docstore import InMemoryDocstore
+from langchain_community.document_loaders import TextLoader
+from langchain.docstore.document import Document as LangchainDocument
+from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain_core.indexing import index
+from langchain_core.vectorstores import VectorStore
+from llama_index.core.node_parser import TextSplitter
+from llama_index.legacy.vector_stores import FaissVectorStore
+from pycparser.ply.yacc import token
+from ragatouille import RAGPretrainedModel
+from langchain_text_splitters import MarkdownHeaderTextSplitter, CharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from sqlalchemy.testing.suite.test_reflection import metadata
+from sympy.solvers.diophantine.diophantine import length
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
+from transformers import pipeline
+#DEPR:from langchain.vectorstores import FAISS
+import faiss
+from langchain_community.vectorstores import FAISS
+#DEPR: from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores.utils import DistanceStrategy
+from huggingface_hub import login
+# Press Umschalt+F10 to execute it or replace it with your code.
+# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
+# Log in to the Hugging Face Hub with the token read from the environment above.
+login(token=HF_KEY)
+# Module-level state shared by the BSIChatbot methods:
+# `vectorstore` is assigned in initializeEmbeddingModel(),
+# `rerankingModel` is assigned in initializeRerankingModel().
+vectorstore=None
+rerankingModel=None
+class BSIChatbot:
+    """Retrieval-augmented chatbot: embeds documents, retrieves chunks and streams LLM answers."""
+    # HuggingFaceEmbeddings instance, created in initializeEmbeddingModel().
+    embedding_model = None
+    # transformers text-generation pipeline, created in initializeLLM().
+    llmpipeline = None
+    # Tokenizer of the LLM, created in initializeLLM(); also used to render the chat template.
+    llmtokenizer = None
+    # NOTE(review): never assigned in the visible code; the module-level
+    # `vectorstore` global is used instead.
+    vectorstore = None
+    # TextIteratorStreamer created in initializeLLM(); returned by ragPrompt().
+    streamer = None
+    # Overwritten by ragPrompt() with the regex matches found in the final prompt;
+    # read by returnImages().
+    images = [None]
+    # model_paths = {
+    #    'llm_path': 'meta-llama/Llama-3.2-3B-Instruct',
+    #    'embed_model_path': 'intfloat/multilingual-e5-large-instruct',
+    #    'rerank_model_path': 'domci/ColBERTv2-mmarco-de-0.1'
+    # }
+    # Model identifier loaded by initializeLLM().
+    llm_path = "meta-llama/Llama-3.2-3B-Instruct"
+    # Model identifier used for both the embedding model and the chunking tokenizer.
+    word_and_embed_model_path = "intfloat/multilingual-e5-large-instruct"
+    # Directory whose .md/.txt files are read when the embeddings are rebuilt.
+    docs = "/home/user/app/docs"
+    #docs = "H:\\Uni\\Master\\Masterarbeit\\Masterarbeit\\daten\\_parsed_embed_test"
+    # Model identifier passed to RAGPretrainedModel.from_pretrained().
+    rerankModelPath="AdrienB134/ColBERTv1.0-german-mmarcoDE"
+    # Directory the FAISS index is saved to and loaded from.
+    embedPath="/home/user/app/docs/_embeddings"
+    def __init__(self):
+        self.embedding_model = None
+        #self.vectorstore: VectorStore = None
+    def initializeEmbeddingModel(self, new_embedding):
+        global vectorstore
+        RAW_KNOWLEDGE_BASE = []
+        #Embedding, Vector generation and storing:
+        self.embedding_model = HuggingFaceEmbeddings(
+            model_name=self.word_and_embed_model_path,
+            multi_process=True,
+            model_kwargs={"device": "cuda"},
+            encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
+        )
+        #index_cpu = faiss.IndexFlatL2(1024)
+        #res = faiss.StandardGpuResources()
+        #index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu)
+        dirList = os.listdir(self.docs)
+        if (new_embedding==True):
+            for doc in dirList:
+                print(doc)
+                if (".md" in doc):
+                    ##doctxt = TextLoader(docs + "\\" + doc).load()
+                    file = open(self.docs + "\\" + doc, 'r', encoding='utf-8')
+                    doctxt = file.read()
+                    RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc}))
+                    file.close()
+                if (".txt" in doc):
+                    file = open(self.docs + "\\" + doc, 'r', encoding='cp1252')
+                    doctxt = file.read()
+                    if doc.replace(".txt",".png") in dirList:
+                        RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc.replace(".txt",".png")}))
+                    if doc.replace(".txt",".jpg") in dirList:
+                        RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc.replace(".txt",".jpg")}))
+                    file.close()
+                    # RAW_KNOWLEDGE_BASE.append(txtLoader)
+                    # print(RAW_KNOWLEDGE_BASE)
+            # Chunking starts here
+            headers_to_split_on = [
+                ("#", "Header 1"),
+                ("##", "Header 2"),
+                ("###", "Header 3"),
+                ("####", "Header 4"),
+                ("#####", "Header 5"),
+            ]
+            markdown_splitter = MarkdownHeaderTextSplitter(
+                headers_to_split_on=headers_to_split_on,
+                strip_headers=True
+            )
+            tokenizer = AutoTokenizer.from_pretrained(self.word_and_embed_model_path)
+            text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+                tokenizer=tokenizer,
+                chunk_size=512,  # The maximum number of words in a chunk
+                chunk_overlap=0,  # The number of characters to overlap between chunks
+                add_start_index=True,  # If `True`, includes chunk's start index in metadata
+                strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
+            )
+            ##Was macht man mit start Index herausfinden und wie metadata adden
+            docs_processed = []
+            for doc in RAW_KNOWLEDGE_BASE:
+                print(f"Word-Length in doc:{len(doc.page_content.split())}")
+                doc_cache = markdown_splitter.split_text(doc.page_content)
+                # print(f"Word-Length in doc_cache after MarkdownSplitter:{len(doc_cache.split())}")
+                doc_cache = text_splitter.split_documents(doc_cache)
+                # print(f"Word-Length in doc_cache after text_splitter:{len(doc_cache.split())}")
+                for chunk in doc_cache:
+                    chunk.metadata.update({"source": doc.metadata['source']})
+                    print(f"Chunk_Debug len: {len(chunk.page_content.split())} and Chunk:{chunk}")
+                # DEBUG:
+                # print(f"doc_cache after Metadata added:{doc_cache}\n")
+                docs_processed += doc_cache
+            #final_docs = []
+            #for doc in docs_processed:
+            #   final_docs += text_splitter.split_documents([doc])
+            #docs_processed = final_docs
+            ##Ab hier alt:
+            # MARKDOWN_SEPARATORS = [
+            #    "\n\n",
+            #    "---"
+            #    "\n",
+            #    " ",
+            #    ""
+            # ]
+            #text_splitter = RecursiveCharacterTextSplitter(
+            #    chunk_size=512,  # The maximum number of characters in a chunk
+            #    chunk_overlap=100,  # The number of characters to overlap between chunks
+            #    add_start_index=True,  # If `True`, includes chunk's start index in metadata
+            #    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
+            #    separators=MARKDOWN_SEPARATORS,
+            #)
+            #docs_processed = []
+            #for doc in RAW_KNOWLEDGE_BASE:
+            #    docs_processed += text_splitter.split_documents([doc])
+            print(f"Docs processed:{len(docs_processed)}")
+            # Max_Sequence_Length of e5 large instr = 512 Tokens
+            # Make sure the maximum length is below embedding size
+            lengths = [len(s.page_content) for s in docs_processed]
+            print(max(lengths))
+            #for l in docs_processed:
+            #    print(f"Char-Length:{len(l.page_content.split())}")
+            #    print(f"Tokenizer Length: {len(tokenizer.tokenize(l.page_content))}")
+            #if (max(lengths) > SentenceTransformer(self.word_and_embed_model_path).max_seq_length):
+            #    print(
+            #        f'Error: Fit chunking size into embedding model.. Chunk{max(lengths)} is bigger than {SentenceTransformer(self.word_and_embed_model_path).Max_Sequence_Length}')
+            start = time.time()
+            #docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs_processed)})
+            #index_to_docstore_id = {i: str(i) for i in range(len(docs_processed))}
+            vectorstore = FAISS.from_documents(docs_processed, self.embedding_model, distance_strategy=DistanceStrategy.COSINE)
+            #self.vectorstore = FAISS(
+            #    embedding_function=self.embedding_model,
+            #    index=index_gpu,
+            #    distance_strategy=DistanceStrategy.COSINE,
+            #    docstore=docstore,
+            #    index_to_docstore_id=index_to_docstore_id
+            #)
+            #self.vectorstore.from_documents(docs_processed, self.embedding_model)
+            #index_cpu = faiss.index_gpu_to_cpu(self.vectorstore.index)
+            #self.vectorstore.index = index_cpu
+            vectorstore.save_local(self.embedPath)
+            #self.vectorstore.index = index_gpu
+            end = time.time()
+            print("Saving Embeddings took", end-start, "seconds!")
+        else:
+            start = time.time()
+            vectorstore = FAISS.load_local(self.embedPath, self.embedding_model, allow_dangerous_deserialization=True)
+            #self.vectorstore.index = index_gpu
+            end = time.time()
+            print("Loading Embeddings took", end - start, "seconds!")
+    def retrieveSimiliarEmbedding(self, query):
+        global vectorstore
+        print("Retrieving Embeddings...")
+        start = time.time()
+        query = f"Instruct: Given a search query, retrieve the relevant passages that answer the query\nQuery:{query}"
+        #self.vectorstore.
+        #retrieved_chunks = self.vectorstore.similarity_search(query=query, k=20)
+        retrieved_chunks = vectorstore.similarity_search(query=query, k=20)
+        #finalchunks = []
+        #for chunk in retrieved_chunks:
+        #    if "---" not in chunk.page_content:
+        #        finalchunks.append(chunk)
+        #retrieved_chunks = finalchunks
+        end = time.time()
+        print("Retrieving Chunks with similiar embeddings took", end - start, "seconds!")
+        #print("\n==================================Top document==================================")
+        #print(retrieved_chunks[0].page_content)
+        #print(retrieved_chunks[1].page_content)
+        #print(retrieved_chunks[2].page_content)
+        #print("==================================Metadata==================================")
+        #print(retrieved_chunks[0].metadata)
+        #print(retrieved_chunks[1].metadata)
+        #print(retrieved_chunks[2].metadata)
+        print(f"printing first chunk to see whats inside: {retrieved_chunks[0]}")
+        return retrieved_chunks
+    def initializeLLM(self):
+        bnb_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+            #bnb_8bit_use_double_quant=True,
+            #bnb_8bit_quant_type="nf4",
+            #bnb_8bit_compute_dtype=torch.bfloat16,
+        )
+        llm = AutoModelForCausalLM.from_pretrained(
+            self.llm_path, quantization_config=bnb_config
+        )
+        self.llmtokenizer = AutoTokenizer.from_pretrained(self.llm_path)
+        self.streamer=TextIteratorStreamer(self.llmtokenizer, skip_prompt=True)
+        self.llmpipeline = pipeline(
+            model=llm,
+            tokenizer=self.llmtokenizer,
+            task="text-generation",
+            do_sample=True,
+            temperature=0.7,
+            repetition_penalty=1.1,
+            return_full_text=False,
+            streamer=self.streamer,
+            max_new_tokens=500,
+        )
+    def queryLLM(self,query):
+        #resp = self.llmpipeline(chat) Fixen
+        return(self.llmpipeline(query)[0]["generated_text"])
+    def initializeRerankingModel(self):
+        global rerankingModel
+        rerankingModel = RAGPretrainedModel.from_pretrained(self.rerankModelPath)
+    @spaces.GPU
+    def ragPrompt(self, query, rerankingStep, history):
+        prompt_in_chat_format = [
+            {
+                "role": "system",
+                "content": """You are an helpful Chatbot for the BSI IT-Grundschutz. Using the information contained in the context,
+                give a comprehensive answer to the question.
+                Respond only to the question asked, response should be concise and relevant but also give some context to the question.
+                Provide the source document when relevant for the understanding.
+                If the answer cannot be deduced from the context, do not give an answer.""",
+            },
+            {
+                "role": "user",
+                "content": """Context:
+                {context}
+                ---
+                Chat-History:
+                {history}
+                ---
+                Now here is the question you need to answer.
+                Question: {question}""",
+            },
+        ]
+        RAG_PROMPT_TEMPLATE = self.llmtokenizer.apply_chat_template(
+            prompt_in_chat_format, tokenize=False, add_generation_prompt=True
+        )
+        retrieved_chunks = self.retrieveSimiliarEmbedding(query)
+        retrieved_chunks_text = []
+        #TODO Irgendwas stimmt hier mit den Listen nicht
+        for chunk in retrieved_chunks:
+            #TODO Hier noch was smarteres Überlegen für alle Header
+            if "Header 1" in chunk.metadata.keys():
+                retrieved_chunks_text.append(f"The Document is: '{chunk.metadata['source']}'\nHeader of the Section is: '{chunk.metadata['Header 1']}' and Content of it:{chunk.page_content}")
+            else:
+                retrieved_chunks_text.append(
+                    f"The Document is: '{chunk.metadata['source']}'\nImage Description is: ':{chunk.page_content}")
+        i=1
+        for chunk in retrieved_chunks_text:
+            print(f"Retrieved Chunk number {i}:\n{chunk}")
+            i=i+1
+        if rerankingStep==True:
+            if rerankingModel == None:
+                print ("initializing Reranker-Model..")
+                self.initializeRerankingModel()
+            print("Starting Reranking Chunks...")
+            rerankingModel
+            retrieved_chunks_text=self.rerankingModel.rerank(query, retrieved_chunks_text,k=5)
+            retrieved_chunks_text=[chunk["content"] for chunk in retrieved_chunks_text]
+            i = 1
+            for chunk in retrieved_chunks_text:
+                print(f"Reranked Chunk number {i}:\n{chunk}")
+                i = i + 1
+        context = "\nExtracted documents:\n"
+        context += "".join([doc for i, doc in enumerate(retrieved_chunks_text)])
+        #Alles außer letzte Useranfrage
+        final_prompt = RAG_PROMPT_TEMPLATE.format(
+            question=query, context=context, history=history[:-1]
+        )
+        print(f"Query:\n{final_prompt}")
+        pattern = r"Filename:(.*?);"
+        match = re.findall(pattern, final_prompt)
+        self.images=match
+        #queryModel = HuggingFacePipeline(pipeline = self.llmpipeline)
+        generation_thread = threading.Thread(target=self.llmpipeline, args=(final_prompt,))
+        generation_thread.start()
+        return self.streamer
+        #answer=self.queryLLM(final_prompt)
+        #answer = self.llmpipeline(final_prompt)
+        #for token in answer:
+        #    print (token["generated_text"])
+        #    yield token["generated_text"]
+        # gen = queryModel.stream(final_prompt)
+        #return gen
+        #print (f"Answer:\n{answer}")
+    def returnImages(self):
+        imageList = []
+        for image in self.images:
+            imageList.append(f"{self.docs}\\{image}")
+        return imageList
+    def launchGr(self):
+        gr.Interface.from_pipeline(self.llmpipeline).launch()
+# Script entry point: initialise the models, then build and launch the Gradio chat UI.
+if __name__ == '__main__':
+    #RAW_KNOWLEDGE_BASE = []
+    #RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content="1Text", metadata={"source": "bb"}))
+    #RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content="2Text", metadata={"source": "aa"}))
+    #RAW_KNOWLEDGE_BASE[0].metadata.update({"NeuerKey":"White"})
+    #print(RAW_KNOWLEDGE_BASE)
+    #time.sleep(10)
+    #{doc.page_content} [{doc.metadata}] => embed the current header into every chunk; retrieve Doc.Metadata
+    # False: load the saved FAISS index instead of re-embedding all documents.
+    renewEmbeddings = False
+    # True: rerank the retrieved chunks before they are put into the prompt.
+    reranking = True
+    bot = BSIChatbot()
+    bot.initializeEmbeddingModel(renewEmbeddings)
+    if reranking == True:
+        bot.initializeRerankingModel()
+    #TODO: DEBUG:
+    #bot.retrieveSimiliarEmbedding("Was ist der IT-Grundschutz?")
+    #TODO: DEBUG:
+    #time.sleep(10)
+    bot.initializeLLM()
+    # Commented-out manual test calls; the multiple-choice sample question is in German.
+    #bot.retrieveSimiliarEmbedding("Welche Typen von Anforderungen gibt es im IT-Grundschutz?")
+    #bot.queryLLM("Welche Typen von Anforderungen gibt es im IT-Grundschutz?")
+    #bot.ragPrompt("""
+    #Welche Informationen beinhaltet die IT-Grundschutz-Methodik (BSI-Standard 200-2)? Wähle aus den folgenden Antwortmöglichkeiten (mehrere können richtig sein!):
+    #A: besonders schutzwürdigen Komponenten,
+    #B: methodische Hilfestellungen zur schrittweisen Einführung eines ISMS,
+    #C: wie die Informationssicherheit im laufenden Betrieb aufrechterhalten und kontinuierlich verbessert werden kann,
+    #D: effiziente Verfahren, um die allgemeinen Anforderungen des BSI-Standards 200-1 zu konkretisieren
+    #""", True)
+    #bot.launchGr()
+    with gr.Blocks() as demo:
+        with gr.Row() as row:
+                # Left column: chat window, input box and the two buttons.
+                with gr.Column(scale=3):
+                    chatbot = gr.Chatbot(type="messages")
+                    msg = gr.Textbox()
+                    clear = gr.Button("Clear")
+                    reset = gr.Button("Reset")
+                with gr.Column(scale=1):  # image gallery
+                    gallery = gr.Gallery(label="Bildergalerie",elem_id="gallery")
+        def user(user_message, history: list):
+            """Clear the textbox and append the user's message to the chat history."""
+            return "", history + [{"role": "user", "content": user_message}]
+        def returnImages():
+            """Return the image paths collected by the bot for the last prompt."""
+            # Fetch the image paths from the bot; they are returned unchanged.
+            image_paths = bot.returnImages()
+            print(f"returning images: {image_paths}")
+            return image_paths
+        def gradiobot(history: list):
+            """Stream the bot's answer to the latest user message into the chat history.
+
+            Generator: yields the updated history together with the image paths
+            after every streamed piece of text.
+            """
+            start = time.time()
+            print(f"ragQuery hist -1:{history[-1].get('content')}")
+            print(f"ragQuery hist 0:{history[0].get('content')}")
+            print(f"fullHistory: {history}" )
+            # history[-1] is the message that user() has just appended.
+            bot_response = bot.ragPrompt(history[-1].get('content'), reranking, history)
+            # Empty assistant message that is filled piece by piece below.
+            history.append({"role": "assistant", "content": ""})
+            image_gallery = returnImages()
+            for token in bot_response:
+                # Remove the end-of-turn marker from the streamed text.
+                if "eot_id" in token:
+                    token = token.replace("<|eot_id|>","")
+                # Start a new line before pieces that begin with "-" or with "1." to "9.".
+                if token.startswith("-"):
+                    token = f"\n{token}"
+                if re.match(r"^[1-9]\.",token):
+                    token = f"\n{token}"
+                history[-1]['content'] += token
+                yield history, image_gallery
+            end = time.time()
+            print("End2End Query took", end - start, "seconds!")
+        def resetHistory():
+            """Return an empty chat history."""
+            return []
+        # On submit: first store the user message, then stream the answer and the images.
+        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+            gradiobot, inputs=[chatbot], outputs=[chatbot, gallery]
+        )
+        clear.click(lambda: None, None, chatbot, queue=False)
+        reset.click(resetHistory, outputs=chatbot, queue=False)
+    # Styling for the element with elem_id="gallery": two-column grid that scrolls.
+    demo.css = """
+        #gallery {
+            display: grid;
+            grid-template-columns: repeat(2, 1fr);
+            gap: 10px;
+            height: 400px;
+            overflow: auto;
+        }
+    """
+    # allowed_paths is set to the documents directory, the same path as BSIChatbot.docs.
+    demo.launch(allowed_paths=["/home/user/app/docs"])
+    #Answer: B, C and D => correct!
+# See PyCharm help at https://www.jetbrains.com/help/pycharm/