MikeMann committed on
Commit
2c91b42
1 Parent(s): e9f7097
Files changed (1)
  1. app.py +0 -483
app.py DELETED
@@ -1,483 +0,0 @@
- import os
- os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable CUDA initialization
- os.environ["allow_dangerous_deserialization"] = "True"
- print(os.getcwd())
- embedding_path="/home/user/app/docs/_embeddings/index.faiss"
- print(f"Loading FAISS index from: {embedding_path}")
- if not os.path.exists(embedding_path):
-     print("File not found!")
- HF_KEY=os.getenv('Gated_Repo')
-
- import spaces
- import time
- from typing import final
- import asyncio
-
- import torch
- import gradio as gr
- import threading
- import re
-
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.docstore import InMemoryDocstore
- from langchain_community.document_loaders import TextLoader
- from langchain.docstore.document import Document as LangchainDocument
- from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
- from langchain_core.indexing import index
- from langchain_core.vectorstores import VectorStore
- from llama_index.core.node_parser import TextSplitter
- from llama_index.legacy.vector_stores import FaissVectorStore
- from pycparser.ply.yacc import token
- from ragatouille import RAGPretrainedModel
-
- from langchain_text_splitters import MarkdownHeaderTextSplitter, CharacterTextSplitter
- from sentence_transformers import SentenceTransformer
- from sqlalchemy.testing.suite.test_reflection import metadata
- from sympy.solvers.diophantine.diophantine import length
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
- from transformers import pipeline
-
- #DEPR:from langchain.vectorstores import FAISS
- import faiss
- from langchain_community.vectorstores import FAISS
- #DEPR: from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_community.vectorstores.utils import DistanceStrategy
- from huggingface_hub import login
-
- # Press Shift+F10 to execute it or replace it with your code.
- # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
-
- login(token=HF_KEY)
-
- vectorstore=None
-
- class BSIChatbot:
-     embedding_model = None
-     llmpipeline = None
-     llmtokenizer = None
-     vectorstore = None
-     rerankingModel = None
-     streamer = None
-     images = [None]
-
-     # model_paths = {
-     #     'llm_path': 'meta-llama/Llama-3.2-3B-Instruct',
-     #     'embed_model_path': 'intfloat/multilingual-e5-large-instruct',
-     #     'rerank_model_path': 'domci/ColBERTv2-mmarco-de-0.1'
-     # }
-
-     llm_path = "meta-llama/Llama-3.2-3B-Instruct"
-     word_and_embed_model_path = "intfloat/multilingual-e5-large-instruct"
-     docs = "/home/user/app/docs"
-     #docs = "H:\\Uni\\Master\\Masterarbeit\\Masterarbeit\\daten\\_parsed_embed_test"
-     rerankModelPath="AdrienB134/ColBERTv1.0-german-mmarcoDE"
-     embedPath="/home/user/app/docs/_embeddings"
-
-     def __init__(self):
-         self.embedding_model = None
-         #self.vectorstore: VectorStore = None
-
-     def initializeEmbeddingModel(self, new_embedding):
-         global vectorstore
-         RAW_KNOWLEDGE_BASE = []
-
-         #Embedding, Vector generation and storing:
-         self.embedding_model = HuggingFaceEmbeddings(
-             model_name=self.word_and_embed_model_path,
-             multi_process=True,
-             model_kwargs={"device": "cuda"},
-             encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
-         )
-
-         #index_cpu = faiss.IndexFlatL2(1024)
-         #res = faiss.StandardGpuResources()
-         #index_gpu = faiss.index_cpu_to_gpu(res, 0, index_cpu)
-         dirList = os.listdir(self.docs)
-         if (new_embedding==True):
-             for doc in dirList:
-                 print(doc)
-                 if (".md" in doc):
-                     ##doctxt = TextLoader(docs + "\\" + doc).load()
-                     file = open(self.docs + "\\" + doc, 'r', encoding='utf-8')
-                     doctxt = file.read()
-                     RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc}))
-                     file.close()
-                 if (".txt" in doc):
-                     file = open(self.docs + "\\" + doc, 'r', encoding='cp1252')
-                     doctxt = file.read()
-                     if doc.replace(".txt",".png") in dirList:
-                         RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc.replace(".txt",".png")}))
-                     if doc.replace(".txt",".jpg") in dirList:
-                         RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content=doctxt, metadata={"source": doc.replace(".txt",".jpg")}))
-                     file.close()
-
-             # RAW_KNOWLEDGE_BASE.append(txtLoader)
-             # print(RAW_KNOWLEDGE_BASE)
-
-             # Chunking starts here
-
-             headers_to_split_on = [
-                 ("#", "Header 1"),
-                 ("##", "Header 2"),
-                 ("###", "Header 3"),
-                 ("####", "Header 4"),
-                 ("#####", "Header 5"),
-             ]
-
-             markdown_splitter = MarkdownHeaderTextSplitter(
-                 headers_to_split_on=headers_to_split_on,
-                 strip_headers=True
-             )
-
-             tokenizer = AutoTokenizer.from_pretrained(self.word_and_embed_model_path)
-
-             text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-                 tokenizer=tokenizer,
-                 chunk_size=512,  # The maximum number of tokens in a chunk
-                 chunk_overlap=0,  # The number of tokens to overlap between chunks
-                 add_start_index=True,  # If `True`, includes chunk's start index in metadata
-                 strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
-             )
-
-             ## TODO: figure out what to do with the start index and how to add metadata
-             docs_processed = []
-             for doc in RAW_KNOWLEDGE_BASE:
-                 print(f"Word-Length in doc:{len(doc.page_content.split())}")
-                 doc_cache = markdown_splitter.split_text(doc.page_content)
-                 # print(f"Word-Length in doc_cache after MarkdownSplitter:{len(doc_cache.split())}")
-                 doc_cache = text_splitter.split_documents(doc_cache)
-                 # print(f"Word-Length in doc_cache after text_splitter:{len(doc_cache.split())}")
-                 for chunk in doc_cache:
-                     chunk.metadata.update({"source": doc.metadata['source']})
-                     print(f"Chunk_Debug len: {len(chunk.page_content.split())} and Chunk:{chunk}")
-                 # DEBUG:
-                 # print(f"doc_cache after Metadata added:{doc_cache}\n")
-                 docs_processed += doc_cache
-
-             #final_docs = []
-             #for doc in docs_processed:
-             #    final_docs += text_splitter.split_documents([doc])
-
-             #docs_processed = final_docs
-
-             ## Old version from here on:
-             # MARKDOWN_SEPARATORS = [
-             #     "\n\n",
-             #     "---"
-             #     "\n",
-             #     " ",
-             #     ""
-             # ]
-
-             #text_splitter = RecursiveCharacterTextSplitter(
-             #    chunk_size=512,  # The maximum number of characters in a chunk
-             #    chunk_overlap=100,  # The number of characters to overlap between chunks
-             #    add_start_index=True,  # If `True`, includes chunk's start index in metadata
-             #    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
-             #    separators=MARKDOWN_SEPARATORS,
-             #)
-
-             #docs_processed = []
-             #for doc in RAW_KNOWLEDGE_BASE:
-             #    docs_processed += text_splitter.split_documents([doc])
-
-             print(f"Docs processed:{len(docs_processed)}")
-             # Max_Sequence_Length of e5 large instr = 512 Tokens
-
-
-             # Make sure the maximum length is below embedding size
-             lengths = [len(s.page_content) for s in docs_processed]
-             print(max(lengths))
-
-             #for l in docs_processed:
-             #    print(f"Char-Length:{len(l.page_content.split())}")
-             #    print(f"Tokenizer Length: {len(tokenizer.tokenize(l.page_content))}")
-
-             #if (max(lengths) > SentenceTransformer(self.word_and_embed_model_path).max_seq_length):
-             #    print(
-             #        f'Error: Fit chunking size into embedding model.. Chunk{max(lengths)} is bigger than {SentenceTransformer(self.word_and_embed_model_path).Max_Sequence_Length}')
-
-             start = time.time()
-             #docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs_processed)})
-             #index_to_docstore_id = {i: str(i) for i in range(len(docs_processed))}
-             vectorstore = FAISS.from_documents(docs_processed, self.embedding_model, distance_strategy=DistanceStrategy.COSINE)
-             #self.vectorstore = FAISS(
-             #    embedding_function=self.embedding_model,
-             #    index=index_gpu,
-             #    distance_strategy=DistanceStrategy.COSINE,
-             #    docstore=docstore,
-             #    index_to_docstore_id=index_to_docstore_id
-             #)
-             #self.vectorstore.from_documents(docs_processed, self.embedding_model)
-             #index_cpu = faiss.index_gpu_to_cpu(self.vectorstore.index)
-             #self.vectorstore.index = index_cpu
-             vectorstore.save_local(self.embedPath)
-             #self.vectorstore.index = index_gpu
-             end = time.time()
-             print("Saving Embeddings took", end-start, "seconds!")
-         else:
-             start = time.time()
-             vectorstore = FAISS.load_local(self.embedPath, self.embedding_model, allow_dangerous_deserialization=True)
-             #self.vectorstore.index = index_gpu
-             end = time.time()
-
-             print("Loading Embeddings took", end - start, "seconds!")
-
-     def retrieveSimiliarEmbedding(self, query):
-         global vectorstore
-         print("Retrieving Embeddings...")
-         start = time.time()
-         query = f"Instruct: Given a search query, retrieve the relevant passages that answer the query\nQuery:{query}"
-
-         #self.vectorstore.
-         #retrieved_chunks = self.vectorstore.similarity_search(query=query, k=20)
-         retrieved_chunks = vectorstore.similarity_search(query=query, k=20)
-         #finalchunks = []
-         #for chunk in retrieved_chunks:
-         #    if "---" not in chunk.page_content:
-         #        finalchunks.append(chunk)
-         #retrieved_chunks = finalchunks
-         end = time.time()
-         print("Retrieving chunks with similar embeddings took", end - start, "seconds!")
-         #print("\n==================================Top document==================================")
-         #print(retrieved_chunks[0].page_content)
-         #print(retrieved_chunks[1].page_content)
-         #print(retrieved_chunks[2].page_content)
-         #print("==================================Metadata==================================")
-         #print(retrieved_chunks[0].metadata)
-         #print(retrieved_chunks[1].metadata)
-         #print(retrieved_chunks[2].metadata)
-         print(f"Printing first chunk to see what's inside: {retrieved_chunks[0]}")
-         return retrieved_chunks
-
-     def initializeLLM(self):
-         bnb_config = BitsAndBytesConfig(
-             load_in_8bit=True,
-             #bnb_8bit_use_double_quant=True,
-             #bnb_8bit_quant_type="nf4",
-             #bnb_8bit_compute_dtype=torch.bfloat16,
-         )
-         llm = AutoModelForCausalLM.from_pretrained(
-             self.llm_path, quantization_config=bnb_config
-         )
-         self.llmtokenizer = AutoTokenizer.from_pretrained(self.llm_path)
-         self.streamer=TextIteratorStreamer(self.llmtokenizer, skip_prompt=True)
-         self.llmpipeline = pipeline(
-             model=llm,
-             tokenizer=self.llmtokenizer,
-             task="text-generation",
-             do_sample=True,
-             temperature=0.7,
-             repetition_penalty=1.1,
-             return_full_text=False,
-             streamer=self.streamer,
-             max_new_tokens=500,
-         )
-
-     def queryLLM(self,query):
-         #resp = self.llmpipeline(chat)  # TODO: fix this
-         return(self.llmpipeline(query)[0]["generated_text"])
-
-     def initializeRerankingModel(self):
-         self.rerankingModel = RAGPretrainedModel.from_pretrained(self.rerankModelPath)
-
-     @spaces.GPU
-     def ragPrompt(self, query, rerankingStep, history):
-         prompt_in_chat_format = [
-             {
-                 "role": "system",
-                 "content": """You are a helpful chatbot for the BSI IT-Grundschutz. Using the information contained in the context,
- give a comprehensive answer to the question.
- Respond only to the question asked; the response should be concise and relevant but also give some context to the question.
- Provide the source document when relevant for the understanding.
- If the answer cannot be deduced from the context, do not give an answer.""",
-             },
-             {
-                 "role": "user",
-                 "content": """Context:
- {context}
- ---
- Chat-History:
- {history}
- ---
- Now here is the question you need to answer.
-
- Question: {question}""",
-             },
-         ]
-         RAG_PROMPT_TEMPLATE = self.llmtokenizer.apply_chat_template(
-             prompt_in_chat_format, tokenize=False, add_generation_prompt=True
-         )
-         retrieved_chunks = self.retrieveSimiliarEmbedding(query)
-         retrieved_chunks_text = []
-         #TODO: something is off with the lists here
-         for chunk in retrieved_chunks:
-             #TODO: find something smarter here for all headers
-             if "Header 1" in chunk.metadata.keys():
-                 retrieved_chunks_text.append(f"The Document is: '{chunk.metadata['source']}'\nHeader of the Section is: '{chunk.metadata['Header 1']}' and Content of it:{chunk.page_content}")
-             else:
-                 retrieved_chunks_text.append(
-                     f"The Document is: '{chunk.metadata['source']}'\nImage Description is: '{chunk.page_content}'")
-         i=1
-         for chunk in retrieved_chunks_text:
-             print(f"Retrieved Chunk number {i}:\n{chunk}")
-             i=i+1
-
-         if rerankingStep==True:
-             if self.rerankingModel == None:
-                 print("initializing Reranker-Model..")
-                 self.initializeRerankingModel()
-             print("Starting Reranking Chunks...")
-             self.rerankingModel
-             retrieved_chunks_text=self.rerankingModel.rerank(query, retrieved_chunks_text,k=5)
-             retrieved_chunks_text=[chunk["content"] for chunk in retrieved_chunks_text]
-
-             i = 1
-             for chunk in retrieved_chunks_text:
-                 print(f"Reranked Chunk number {i}:\n{chunk}")
-                 i = i + 1
-
-         context = "\nExtracted documents:\n"
-         context += "".join([doc for i, doc in enumerate(retrieved_chunks_text)])
-         # Everything except the latest user request
-         final_prompt = RAG_PROMPT_TEMPLATE.format(
-             question=query, context=context, history=history[:-1]
-         )
-
-         print(f"Query:\n{final_prompt}")
-         pattern = r"Filename:(.*?);"
-         match = re.findall(pattern, final_prompt)
-         self.images=match
-
-         #queryModel = HuggingFacePipeline(pipeline = self.llmpipeline)
-         generation_thread = threading.Thread(target=self.llmpipeline, args=(final_prompt,))
-         generation_thread.start()
-
-         return self.streamer
-
-         #answer=self.queryLLM(final_prompt)
-         #answer = self.llmpipeline(final_prompt)
-         #for token in answer:
-         #    print(token["generated_text"])
-         #    yield token["generated_text"]
-         # gen = queryModel.stream(final_prompt)
-
-
-         #return gen
-
-         #print(f"Answer:\n{answer}")
-
-     def returnImages(self):
-         imageList = []
-         for image in self.images:
-             imageList.append(f"{self.docs}\\{image}")
-         return imageList
-
-     def launchGr(self):
-         gr.Interface.from_pipeline(self.llmpipeline).launch()
-
-
-
- if __name__ == '__main__':
-     #RAW_KNOWLEDGE_BASE = []
-     #RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content="1Text", metadata={"source": "bb"}))
-     #RAW_KNOWLEDGE_BASE.append(LangchainDocument(page_content="2Text", metadata={"source": "aa"}))
-     #RAW_KNOWLEDGE_BASE[0].metadata.update({"NeuerKey":"White"})
-     #print(RAW_KNOWLEDGE_BASE)
-     #time.sleep(10)
-
-     #{doc.page_content} [{doc.metadata}] => embed the current header into every chunk; retrieve Doc.Metadata
-
-     renewEmbeddings = False
-     reranking = True
-     bot = BSIChatbot()
-     bot.initializeEmbeddingModel(renewEmbeddings)
-     if reranking == True:
-         bot.initializeRerankingModel()
-     #TODO: DEBUG:
-     #bot.retrieveSimiliarEmbedding("Was ist der IT-Grundschutz?")
-     #TODO: DEBUG:
-     #time.sleep(10)
-     bot.initializeLLM()
-     #bot.retrieveSimiliarEmbedding("Welche Typen von Anforderungen gibt es im IT-Grundschutz?")
-
-     #bot.queryLLM("Welche Typen von Anforderungen gibt es im IT-Grundschutz?")
-
-     #bot.ragPrompt("""
-     #Welche Informationen beinhaltet die IT-Grundschutz-Methodik (BSI-Standard 200-2)? Wähle aus den folgenden Antwortmöglichkeiten (mehrere können richtig sein!):
-     #A: besonders schutzwürdigen Komponenten,
-     #B: methodische Hilfestellungen zur schrittweisen Einführung eines ISMS,
-     #C: wie die Informationssicherheit im laufenden Betrieb aufrechterhalten und kontinuierlich verbessert werden kann,
-     #D: effiziente Verfahren, um die allgemeinen Anforderungen des BSI-Standards 200-1 zu konkretisieren
-     #""", True)
-
-     #bot.launchGr()
-
-     with gr.Blocks() as demo:
-         with gr.Row() as row:
-             with gr.Column(scale=3):
-                 chatbot = gr.Chatbot(type="messages")
-                 msg = gr.Textbox()
-                 clear = gr.Button("Clear")
-                 reset = gr.Button("Reset")
-             with gr.Column(scale=1):  # image gallery
-                 gallery = gr.Gallery(label="Bildergalerie", elem_id="gallery")
-
-         def user(user_message, history: list):
-             return "", history + [{"role": "user", "content": user_message}]
-
-
-         def returnImages():
-             # Fetch the image paths here and turn them into gr.Image objects
-             image_paths = bot.returnImages()
-             print(f"returning images: {image_paths}")
-             return image_paths
-
-         def gradiobot(history: list):
-             start = time.time()
-             print(f"ragQuery hist -1:{history[-1].get('content')}")
-             print(f"ragQuery hist 0:{history[0].get('content')}")
-             print(f"fullHistory: {history}")
-             bot_response = bot.ragPrompt(history[-1].get('content'), reranking, history)
-             history.append({"role": "assistant", "content": ""})
-
-             image_gallery = returnImages()
-
-             for token in bot_response:
-                 if "eot_id" in token:
-                     token = token.replace("<|eot_id|>","")
-                 if token.startswith("-"):
-                     token = f"\n{token}"
-                 if re.match(r"^[1-9]\.",token):
-                     token = f"\n{token}"
-
-                 history[-1]['content'] += token
-                 yield history, image_gallery
-             end = time.time()
-             print("End2End Query took", end - start, "seconds!")
-
-         def resetHistory():
-             return []
-
-         msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-             gradiobot, inputs=[chatbot], outputs=[chatbot, gallery]
-         )
-
-
-         clear.click(lambda: None, None, chatbot, queue=False)
-         reset.click(resetHistory, outputs=chatbot, queue=False)
-         demo.css = """
-         #gallery {
-             display: grid;
-             grid-template-columns: repeat(2, 1fr);
-             gap: 10px;
-             height: 400px;
-             overflow: auto;
-         }
-         """
-     demo.launch(allowed_paths=["/home/user/app/docs"])
-
-     #Answer: B, C and D => Correct!
-
- # See PyCharm help at https://www.jetbrains.com/help/pycharm/