MikeMann committed on
Commit
2c91b42
1 Parent(s): e9f7097
Files changed (1)
  1. app.py +0 -483
app.py DELETED
@@ -1,483 +0,0 @@
- import os
- os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable CUDA initialization
- os.environ["allow_dangerous_deserialization"] = "True"
- print(os.getcwd())
- embedding_path="/home/user/app/docs/_embeddings/index.faiss"
- print(f"Loading FAISS index from: {embedding_path}")
- if not os.path.exists(embedding_path):
-     print("File not found!")
- HF_KEY=os.getenv('Gated_Repo')
-
- import spaces
- import time
- from typing import final
- import asyncio
-
- import torch
- import gradio as gr
- import threading
- import re
-
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.docstore import InMemoryDocstore
- from langchain_community.document_loaders import TextLoader
- from langchain.docstore.document import Document as LangchainDocument
- from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
- from langchain_core.indexing import index
- from langchain_core.vectorstores import VectorStore
- from llama_index.core.node_parser import TextSplitter
- from llama_index.legacy.vector_stores import FaissVectorStore
- from pycparser.ply.yacc import token
- from ragatouille import RAGPretrainedModel
-
- from langchain_text_splitters import MarkdownHeaderTextSplitter, CharacterTextSplitter
- from sentence_transformers import SentenceTransformer
- from sqlalchemy.testing.suite.test_reflection import metadata
- from sympy.solvers.diophantine.diophantine import length
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
- from transformers import pipeline
-
- #DEPR:from langchain.vectorstores import FAISS
- import faiss
- from langchain_community.vectorstores import FAISS
- #DEPR: from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_community.vectorstores.utils import DistanceStrategy
- from huggingface_hub import login
-
- # Press Shift+F10 to execute it or replace it with your code.
- # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
-
- login(token=HF_KEY)
-
- vectorstore=None
-
- class BSIChatbot:
-     embedding_model = None
-     llmpipeline = None
-     llmtokenizer = None
-     vectorstore = None
-     rerankingModel = None
-     streamer = None
-     images = [None]
-
-     # model_paths = {
-     #     'llm_path': 'meta-llama/Llama-3.2-3B-Instruct',
-     #     'embed_model_path': 'intfloat/multilingual-e5-large-instruct',
-     #     'rerank_model_path': 'domci/ColBERTv2-mmarco-de-0.1'
-     # }
-
-     llm_path = "meta-llama/Llama-3.2-3B-Instruct"
-     word_and_embed_model_path = "intfloat/multilingual-e5-large-instruct"
-     docs = "/home/user/app/docs"
-     #docs = "H:\\Uni\\Master\\Masterarbeit\\Masterarbeit\\daten\\_parsed_embed_test"
-     rerankModelPath="AdrienB134/ColBERTv1.0-german-mmarcoDE"
-     embedPath="/home/user/app/docs/_embeddings"
-
-     def __init__(self):
-         self.embedding_model = None
-         #self.vectorstore: VectorStore = None
-
-     def initializeEmbeddingModel(self, new_embedding):
-         global vectorstore
-         RAW_KNOWLEDGE_BASE = []
-
-         #Embedding, Vector generation and storing:
-         self.embedding_model = HuggingFaceEmbeddings(
-             model_name=self.word_and_embed_model_path,
-             multi_process=True,
-             model_kwargs={"device": "cuda"},
-             encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
-         )
-
-         #index_cpu = faiss.IndexFlatL2(1024)
-         #res = faiss.StandardGpuResources()
-         #index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu)
-         dirList = os.listdir(self.docs)
-         if (new_embedding==True):
-             for doc in dirList:
-                 print(doc)
-                 if (".md" in doc):
-                     ##doctxt = TextLoader(docs + "\\" + doc).load()
-                     file = open(self.docs + "\\" + doc, 'r', encoding='utf-8')
-                     doctxt = file.read()
-                     RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc}))
-                     file.close()
-                 if (".txt" in doc):
-                     file = open(self.docs + "\\" + doc, 'r', encoding='cp1252')
-                     doctxt = file.read()
-                     if doc.replace(".txt",".png") in dirList:
-                         RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc.replace(".txt",".png")}))
-                     if doc.replace(".txt",".jpg") in dirList:
-                         RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc.replace(".txt",".jpg")}))
-                     file.close()
-
-             # RAW_KNOWLEDGE_BASE.append(txtLoader)
-             # print(RAW_KNOWLEDGE_BASE)
-
-             # Chunking starts here
-
-             headers_to_split_on = [
-                 ("#", "Header 1"),
-                 ("##", "Header 2"),
-                 ("###", "Header 3"),
-                 ("####", "Header 4"),
-                 ("#####", "Header 5"),
-             ]
-
-             markdown_splitter = MarkdownHeaderTextSplitter(
-                 headers_to_split_on=headers_to_split_on,
-                 strip_headers=True
-             )
-
-             tokenizer = AutoTokenizer.from_pretrained(self.word_and_embed_model_path)
-
-             text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-                 tokenizer=tokenizer,
-                 chunk_size=512,  # The maximum number of tokens in a chunk
-                 chunk_overlap=0,  # The number of tokens to overlap between chunks
-                 add_start_index=True,  # If `True`, includes chunk's start index in metadata
-                 strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
-             )
-
-             ## TODO: figure out what to do with the start index and how to add metadata
-             docs_processed = []
-             for doc in RAW_KNOWLEDGE_BASE:
-                 print(f"Word-Length in doc:{len(doc.page_content.split())}")
-                 doc_cache = markdown_splitter.split_text(doc.page_content)
-                 # print(f"Word-Length in doc_cache after MarkdownSplitter:{len(doc_cache.split())}")
-                 doc_cache = text_splitter.split_documents(doc_cache)
-                 # print(f"Word-Length in doc_cache after text_splitter:{len(doc_cache.split())}")
-                 for chunk in doc_cache:
-                     chunk.metadata.update({"source": doc.metadata['source']})
-                     print(f"Chunk_Debug len: {len(chunk.page_content.split())} and Chunk:{chunk}")
-                 # DEBUG:
-                 # print(f"doc_cache after Metadata added:{doc_cache}\n")
-                 docs_processed += doc_cache
-
-             #final_docs = []
-             #for doc in docs_processed:
-             #    final_docs += text_splitter.split_documents([doc])
-
-             #docs_processed = final_docs
-
-             ## Old version from here on:
-             # MARKDOWN_SEPARATORS = [
-             #     "\n\n",
-             #     "---"
-             #     "\n",
-             #     " ",
-             #     ""
-             # ]
-
-             #text_splitter = RecursiveCharacterTextSplitter(
-             #    chunk_size=512,  # The maximum number of characters in a chunk
-             #    chunk_overlap=100,  # The number of characters to overlap between chunks
-             #    add_start_index=True,  # If `True`, includes chunk's start index in metadata
-             #    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
-             #    separators=MARKDOWN_SEPARATORS,
-             #)
-
-             #docs_processed = []
-             #for doc in RAW_KNOWLEDGE_BASE:
-             #    docs_processed += text_splitter.split_documents([doc])
-
-             print(f"Docs processed:{len(docs_processed)}")
-             # Max_Sequence_Length of e5 large instr = 512 Tokens
-
-
-             # Make sure the maximum length is below embedding size
-             lengths = [len(s.page_content) for s in docs_processed]
-             print(max(lengths))
-
-             #for l in docs_processed:
-             #    print(f"Char-Length:{len(l.page_content.split())}")
-             #    print(f"Tokenizer Length: {len(tokenizer.tokenize(l.page_content))}")
-
-             #if (max(lengths) > SentenceTransformer(self.word_and_embed_model_path).max_seq_length):
-             #    print(
-             #        f'Error: Fit chunking size into embedding model.. Chunk{max(lengths)} is bigger than {SentenceTransformer(self.word_and_embed_model_path).Max_Sequence_Length}')
-
-             start = time.time()
-             #docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs_processed)})
-             #index_to_docstore_id = {i: str(i) for i in range(len(docs_processed))}
-             vectorstore = FAISS.from_documents(docs_processed, self.embedding_model, distance_strategy=DistanceStrategy.COSINE)
-             #self.vectorstore = FAISS(
-             #    embedding_function=self.embedding_model,
-             #    index=index_gpu,
-             #    distance_strategy=DistanceStrategy.COSINE,
-             #    docstore=docstore,
-             #    index_to_docstore_id=index_to_docstore_id
-             #)
-             #self.vectorstore.from_documents(docs_processed, self.embedding_model)
-             #index_cpu = faiss.index_gpu_to_cpu(self.vectorstore.index)
-             #self.vectorstore.index = index_cpu
-             vectorstore.save_local(self.embedPath)
-             #self.vectorstore.index = index_gpu
-             end = time.time()
-             print("Saving Embeddings took", end-start, "seconds!")
-         else:
-             start = time.time()
-             vectorstore = FAISS.load_local(self.embedPath, self.embedding_model, allow_dangerous_deserialization=True)
-             #self.vectorstore.index = index_gpu
-             end = time.time()
-
-             print("Loading Embeddings took", end - start, "seconds!")
-
-     def retrieveSimiliarEmbedding(self, query):
-         global vectorstore
-         print("Retrieving Embeddings...")
-         start = time.time()
-         query = f"Instruct: Given a search query, retrieve the relevant passages that answer the query\nQuery:{query}"
-
-         #self.vectorstore.
-         #retrieved_chunks = self.vectorstore.similarity_search(query=query, k=20)
-         retrieved_chunks = vectorstore.similarity_search(query=query, k=20)
-         #finalchunks = []
-         #for chunk in retrieved_chunks:
-         #    if "---" not in chunk.page_content:
-         #        finalchunks.append(chunk)
-         #retrieved_chunks = finalchunks
-         end = time.time()
-         print("Retrieving chunks with similar embeddings took", end - start, "seconds!")
-         #print("\n==================================Top document==================================")
-         #print(retrieved_chunks[0].page_content)
-         #print(retrieved_chunks[1].page_content)
-         #print(retrieved_chunks[2].page_content)
-         #print("==================================Metadata==================================")
-         #print(retrieved_chunks[0].metadata)
-         #print(retrieved_chunks[1].metadata)
-         #print(retrieved_chunks[2].metadata)
-         print(f"Printing first chunk to see what's inside: {retrieved_chunks[0]}")
-         return retrieved_chunks
-
-     def initializeLLM(self):
-         bnb_config = BitsAndBytesConfig(
-             load_in_8bit=True,
-             #bnb_8bit_use_double_quant=True,
-             #bnb_8bit_quant_type="nf4",
-             #bnb_8bit_compute_dtype=torch.bfloat16,
-         )
-         llm = AutoModelForCausalLM.from_pretrained(
-             self.llm_path, quantization_config=bnb_config
-         )
-         self.llmtokenizer = AutoTokenizer.from_pretrained(self.llm_path)
-         self.streamer=TextIteratorStreamer(self.llmtokenizer, skip_prompt=True)
-         self.llmpipeline = pipeline(
-             model=llm,
-             tokenizer=self.llmtokenizer,
-             task="text-generation",
-             do_sample=True,
-             temperature=0.7,
-             repetition_penalty=1.1,
-             return_full_text=False,
-             streamer=self.streamer,
-             max_new_tokens=500,
-         )
-
-     def queryLLM(self,query):
-         #resp = self.llmpipeline(chat)  # TODO: fix this
-         return(self.llmpipeline(query)[0]["generated_text"])
-
-     def initializeRerankingModel(self):
-         self.rerankingModel = RAGPretrainedModel.from_pretrained(self.rerankModelPath)
-
-     @spaces.GPU
-     def ragPrompt(self, query, rerankingStep, history):
-         prompt_in_chat_format = [
-             {
-                 "role": "system",
-                 "content": """You are a helpful chatbot for the BSI IT-Grundschutz. Using the information contained in the context,
- give a comprehensive answer to the question.
- Respond only to the question asked; the response should be concise and relevant but also give some context to the question.
- Provide the source document when relevant for the understanding.
- If the answer cannot be deduced from the context, do not give an answer.""",
-             },
-             {
-                 "role": "user",
-                 "content": """Context:
- {context}
- ---
- Chat-History:
- {history}
- ---
- Now here is the question you need to answer.
-
- Question: {question}""",
-             },
-         ]
-         RAG_PROMPT_TEMPLATE = self.llmtokenizer.apply_chat_template(
-             prompt_in_chat_format, tokenize=False, add_generation_prompt=True
-         )
-         retrieved_chunks = self.retrieveSimiliarEmbedding(query)
-         retrieved_chunks_text = []
-         #TODO: something is off with the lists here
-         for chunk in retrieved_chunks:
-             #TODO: find something smarter here for all headers
-             if "Header 1" in chunk.metadata.keys():
-                 retrieved_chunks_text.append(f"The Document is: '{chunk.metadata['source']}'\nHeader of the Section is: '{chunk.metadata['Header 1']}' and Content of it:{chunk.page_content}")
-             else:
-                 retrieved_chunks_text.append(
-                     f"The Document is: '{chunk.metadata['source']}'\nImage Description is: '{chunk.page_content}'")
-         i=1
-         for chunk in retrieved_chunks_text:
-             print(f"Retrieved Chunk number {i}:\n{chunk}")
-             i=i+1
-
-         if rerankingStep==True:
-             if self.rerankingModel == None:
-                 print("initializing Reranker-Model..")
-                 self.initializeRerankingModel()
-             print("Starting Reranking Chunks...")
-             self.rerankingModel
-             retrieved_chunks_text=self.rerankingModel.rerank(query, retrieved_chunks_text,k=5)
-             retrieved_chunks_text=[chunk["content"] for chunk in retrieved_chunks_text]
-
-             i = 1
-             for chunk in retrieved_chunks_text:
-                 print(f"Reranked Chunk number {i}:\n{chunk}")
-                 i = i + 1
-
-         context = "\nExtracted documents:\n"
-         context += "".join([doc for i, doc in enumerate(retrieved_chunks_text)])
-         # Everything except the latest user request
-         final_prompt = RAG_PROMPT_TEMPLATE.format(
-             question=query, context=context, history=history[:-1]
-         )
-
-         print(f"Query:\n{final_prompt}")
-         pattern = r"Filename:(.*?);"
-         match = re.findall(pattern, final_prompt)
-         self.images=match
-
-         #queryModel = HuggingFacePipeline(pipeline = self.llmpipeline)
-         generation_thread = threading.Thread(target=self.llmpipeline, args=(final_prompt,))
-         generation_thread.start()
-
-         return self.streamer
-
-         #answer=self.queryLLM(final_prompt)
-         #answer = self.llmpipeline(final_prompt)
-         #for token in answer:
-         #    print(token["generated_text"])
-         #    yield token["generated_text"]
-         # gen = queryModel.stream(final_prompt)
-
-
-         #return gen
-
-         #print(f"Answer:\n{answer}")
-
-     def returnImages(self):
-         imageList = []
-         for image in self.images:
-             imageList.append(f"{self.docs}\\{image}")
-         return imageList
-
-     def launchGr(self):
-         gr.Interface.from_pipeline(self.llmpipeline).launch()
-
-
-
- if __name__ == '__main__':
-     #RAW_KNOWLEDGE_BASE = []
-     #RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content="1Text", metadata={"source": "bb"}))
-     #RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content="2Text", metadata={"source": "aa"}))
-     #RAW_KNOWLEDGE_BASE[0].metadata.update({"NeuerKey":"White"})
-     #print(RAW_KNOWLEDGE_BASE)
-     #time.sleep(10)
-
-     #{doc.page_content} [{doc.metadata}] => embed the current header into every chunk; retrieve Doc.Metadata
-
-     renewEmbeddings = False
-     reranking = True
-     bot = BSIChatbot()
-     bot.initializeEmbeddingModel(renewEmbeddings)
-     if reranking == True:
-         bot.initializeRerankingModel()
-     #TODO: DEBUG:
-     #bot.retrieveSimiliarEmbedding("Was ist der IT-Grundschutz?")
-     #TODO: DEBUG:
-     #time.sleep(10)
-     bot.initializeLLM()
-     #bot.retrieveSimiliarEmbedding("Welche Typen von Anforderungen gibt es im IT-Grundschutz?")
-
-     #bot.queryLLM("Welche Typen von Anforderungen gibt es im IT-Grundschutz?")
-
-     #bot.ragPrompt("""
-     #Welche Informationen beinhaltet die IT-Grundschutz-Methodik (BSI-Standard 200-2)? Wähle aus den folgenden Antwortmöglichkeiten (mehrere können richtig sein!):
-     #A: besonders schutzwürdigen Komponenten,
-     #B: methodische Hilfestellungen zur schrittweisen Einführung eines ISMS,
-     #C: wie die Informationssicherheit im laufenden Betrieb aufrechterhalten und kontinuierlich verbessert werden kann,
-     #D: effiziente Verfahren, um die allgemeinen Anforderungen des BSI-Standards 200-1 zu konkretisieren
-     #""", True)
-
-     #bot.launchGr()
-
-     with gr.Blocks() as demo:
-         with gr.Row() as row:
-             with gr.Column(scale=3):
-                 chatbot = gr.Chatbot(type="messages")
-                 msg = gr.Textbox()
-                 clear = gr.Button("Clear")
-                 reset = gr.Button("Reset")
-             with gr.Column(scale=1):  # image gallery
-                 gallery = gr.Gallery(label="Bildergalerie", elem_id="gallery")
-
-         def user(user_message, history: list):
-             return "", history + [{"role": "user", "content": user_message}]
-
-
-         def returnImages():
-             # Fetch the image paths here and turn them into gr.Image objects
-             image_paths = bot.returnImages()
-             print(f"returning images: {image_paths}")
-             return image_paths
-
-         def gradiobot(history: list):
-             start = time.time()
-             print(f"ragQuery hist -1:{history[-1].get('content')}")
-             print(f"ragQuery hist 0:{history[0].get('content')}")
-             print(f"fullHistory: {history}")
-             bot_response = bot.ragPrompt(history[-1].get('content'), reranking, history)
-             history.append({"role": "assistant", "content": ""})
-
-             image_gallery = returnImages()
-
-             for token in bot_response:
-                 if "eot_id" in token:
-                     token = token.replace("<|eot_id|>","")
-                 if token.startswith("-"):
-                     token = f"\n{token}"
-                 if re.match(r"^[1-9]\.",token):
-                     token = f"\n{token}"
-
-                 history[-1]['content'] += token
-                 yield history, image_gallery
-             end = time.time()
-             print("End2End Query took", end - start, "seconds!")
-
-         def resetHistory():
-             return []
-
-         msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-             gradiobot, inputs=[chatbot], outputs=[chatbot, gallery]
-         )
-
-
-         clear.click(lambda: None, None, chatbot, queue=False)
-         reset.click(resetHistory, outputs=chatbot, queue=False)
-         demo.css = """
-         #gallery {
-             display: grid;
-             grid-template-columns: repeat(2, 1fr);
-             gap: 10px;
-             height: 400px;
-             overflow: auto;
-         }
-         """
-     demo.launch(allowed_paths=["/home/user/app/docs"])
-
-     #Answer: B, C and D => Correct!
-
- # See PyCharm help at https://www.jetbrains.com/help/pycharm/