import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA initialization
# Note: FAISS deserialization is enabled via the allow_dangerous_deserialization argument
# passed to FAISS.load_local below; this environment variable is not read by LangChain.
os.environ["allow_dangerous_deserialization"] = "True"

import spaces
import asyncio
import sys
from typing import List, Dict

import torch
import gradio as gr
from langchain_community.docstore import InMemoryDocstore
from langchain_community.document_loaders import TextLoader
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.utils import DistanceStrategy
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from huggingface_hub import login

# Environment setup
HF_KEY = os.getenv('Gated_Repo')
embedding_path = "/home/user/app/docs/_embeddings/index.faiss"
login(token=HF_KEY)

# Verify embedding file path
if not os.path.exists(embedding_path):
    print("Embedding file not found!")


class BSIChatbot:
    def __init__(self, model_paths: Dict[str, str], docs_path: str):
        self.embedding_model = None
        self.llmpipeline = None
        self.llmtokenizer = None
        self.vectorstore = None
        self.streamer = None
        self.llm_path = model_paths['llm_path']
        self.word_and_embed_model_path = model_paths['embed_model_path']
        self.docs = docs_path

    @spaces.GPU
    async def initialize_embedding_model(self, rebuild_embeddings: bool):
        raw_knowledge_base = []

        # Initialize embedding model
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.word_and_embed_model_path,
            multi_process=True,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},
        )

        if rebuild_embeddings:
            # Load documents (.md files are read as UTF-8, .txt files as cp1252)
            for doc in os.listdir(self.docs):
                file_path = os.path.join(self.docs, doc)
                if doc.endswith(".md") or doc.endswith(".txt"):
                    with open(file_path, 'r', encoding='utf-8' if doc.endswith(".md") else 'cp1252') as file:
                        content = file.read()
                        metadata = {"source": doc}
                        raw_knowledge_base.append(LangchainDocument(page_content=content, metadata=metadata))

            # Split documents into chunks
            tokenizer = AutoTokenizer.from_pretrained(self.word_and_embed_model_path)
            text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                tokenizer=tokenizer,
                chunk_size=512,
                chunk_overlap=0,
                add_start_index=True,
                strip_whitespace=True,
            )

            processed_docs = []
            for doc in raw_knowledge_base:
                chunks = text_splitter.split_documents([doc])
                for chunk in chunks:
                    chunk.metadata.update({"source": doc.metadata['source']})
                processed_docs.extend(chunks)

            # Create and save vector store
            self.vectorstore = FAISS.from_documents(
                processed_docs, self.embedding_model, distance_strategy=DistanceStrategy.COSINE
            )
            self.vectorstore.save_local(os.path.join(self.docs, "_embeddings"))
        else:
            # Load existing vector store
            self.vectorstore = FAISS.load_local(
                os.path.join(self.docs, "_embeddings"), self.embedding_model, allow_dangerous_deserialization=True
            )

    @spaces.GPU
    async def retrieve_similar_embedding(self, query: str):
        if self.vectorstore is None:
            self.vectorstore = FAISS.load_local(
                os.path.join(self.docs, "_embeddings"), self.embedding_model, allow_dangerous_deserialization=True
            )
        # Prefix the query with the instruction expected by the E5 instruct embedding model
        query = f"Instruct: Given a search query, retrieve the relevant passages that answer the query\nQuery:{query}"
        return self.vectorstore.similarity_search(query=query, k=20)

    @spaces.GPU
    async def initialize_llm(self):
        # Load the LLM in 8-bit to reduce GPU memory usage
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
        llm = AutoModelForCausalLM.from_pretrained(self.llm_path, quantization_config=bnb_config)
        self.llmtokenizer = AutoTokenizer.from_pretrained(self.llm_path)
        self.llmpipeline = pipeline(
            model=llm,
            tokenizer=self.llmtokenizer,
            task="text-generation",
            do_sample=True,
            temperature=0.7,
            repetition_penalty=1.1,
            return_full_text=False,
            max_new_tokens=500,
        )

    @spaces.GPU
    async def rag_prompt(self, query: str, rerank: bool, history: List[Dict]):
        # Retrieve the most similar chunks and assemble the prompt from context, chat history and question
        retrieved_chunks = await self.retrieve_similar_embedding(query)
        retrieved_texts = [f"{chunk.metadata['source']}:\n{chunk.page_content}" for chunk in retrieved_chunks]
        context = "\n".join(retrieved_texts)
        history_text = "\n".join([h['content'] for h in history])
        final_prompt = f"""Context:
{context}
---
History:
{history_text}
---
Question: {query}"""
        # Delegate to the async generator so callers can consume the response with `async for`
        async for token in self._generate_response_async(final_prompt):
            yield token

    @spaces.GPU
    async def _generate_response_async(self, final_prompt: str):
        # Run the blocking pipeline call in a worker thread so the event loop stays responsive
        loop = asyncio.get_running_loop()
        outputs = await loop.run_in_executor(None, self.llmpipeline, final_prompt)
        # The text-generation pipeline returns a list of dicts; yield only the generated text
        for output in outputs:
            yield output["generated_text"]

    def launch_interface(self):
        with gr.Blocks() as demo:
            chatbot = gr.Chatbot(type="messages")
            msg = gr.Textbox()
            clear = gr.Button("Clear")
            reset = gr.Button("Reset")

            def user_input(user_message, history):
                return "", history + [{"role": "user", "content": user_message}]

            async def bot_response(history):
                # rag_prompt is an async generator; iterate over it and stream updates into the chat history
                response_generator = self.rag_prompt(history[-1]['content'], True, history)
                history.append({"role": "assistant", "content": ""})
                async for token in response_generator:
                    history[-1]['content'] += token
                    yield history

            msg.submit(user_input, [msg, chatbot], [msg, chatbot]).then(bot_response, chatbot, chatbot)
            clear.click(lambda: None, None, chatbot)
            reset.click(lambda: [], outputs=chatbot)

        demo.launch()


if __name__ == '__main__':
    model_paths = {
        'llm_path': 'meta-llama/Llama-3.2-3B-Instruct',
        'embed_model_path': 'intfloat/multilingual-e5-large-instruct',
    }
    docs_path = '/home/user/app/docs'

    bot = BSIChatbot(model_paths, docs_path)
    asyncio.run(bot.initialize_embedding_model(rebuild_embeddings=False))
    asyncio.run(bot.initialize_llm())
    bot.launch_interface()