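"""Gradio RAG chatbot over a local document collection.

Builds (or loads) a FAISS vector store over Markdown/text files using
multilingual-e5 embeddings, retrieves the most similar chunks for each user
question, and streams an answer generated by a Llama 3.2 Instruct model.
Written for Hugging Face Spaces: GPU-bound methods are decorated with
@spaces.GPU, and CUDA is hidden from the main process at import time.
"""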
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable CUDA initialization
os.environ["allow_dangerous_deserialization"] = "True"
import spaces
import asyncio
import sys
from typing import List, Dict
import torch
import gradio as gr
from langchain_community.docstore import InMemoryDocstore
from langchain_community.document_loaders import TextLoader
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.utils import DistanceStrategy
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from huggingface_hub import login
# Environment setup
HF_KEY = os.getenv('Gated_Repo')
embedding_path = "/home/user/app/docs/_embeddings/index.faiss"
login(token=HF_KEY)
# Verify embedding file path
if not os.path.exists(embedding_path):
    print("Embedding file not found!")
class BSIChatbot:
    def __init__(self, model_paths: Dict[str, str], docs_path: str):
        self.embedding_model = None
        self.llmpipeline = None
        self.llmtokenizer = None
        self.vectorstore = None
        self.streamer = None
        self.llm_path = model_paths['llm_path']
        self.word_and_embed_model_path = model_paths['embed_model_path']
        self.docs = docs_path
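    # Note: @spaces.GPU asks Hugging Face Spaces (ZeroGPU) for a GPU for the duration of the decorated call.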
    @spaces.GPU
    async def initialize_embedding_model(self, rebuild_embeddings: bool):
        raw_knowledge_base = []
        # Initialize embedding model
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.word_and_embed_model_path,
            multi_process=True,
            model_kwargs={"device": "cuda"},
            encode_kwargs={"normalize_embeddings": True},
        )
        if rebuild_embeddings:
            # Load documents
            for doc in os.listdir(self.docs):
                file_path = os.path.join(self.docs, doc)
                if doc.endswith(".md") or doc.endswith(".txt"):
                    with open(file_path, 'r', encoding='utf-8' if doc.endswith(".md") else 'cp1252') as file:
                        content = file.read()
                        metadata = {"source": doc}
                        raw_knowledge_base.append(LangchainDocument(page_content=content, metadata=metadata))
            # Split documents into chunks
            tokenizer = AutoTokenizer.from_pretrained(self.word_and_embed_model_path)
            text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                tokenizer=tokenizer,
                chunk_size=512,
                chunk_overlap=0,
                add_start_index=True,
                strip_whitespace=True,
            )
            processed_docs = []
            for doc in raw_knowledge_base:
                chunks = text_splitter.split_documents([doc])
                for chunk in chunks:
                    chunk.metadata.update({"source": doc.metadata['source']})
                processed_docs.extend(chunks)
            # Create and save vector store
            self.vectorstore = FAISS.from_documents(processed_docs, self.embedding_model, distance_strategy=DistanceStrategy.COSINE)
            self.vectorstore.save_local(os.path.join(self.docs, "_embeddings"))
        else:
            # Load existing vector store
            self.vectorstore = FAISS.load_local(os.path.join(self.docs, "_embeddings"), self.embedding_model, allow_dangerous_deserialization=True)
    @spaces.GPU
    async def retrieve_similar_embedding(self, query: str):
        if self.vectorstore is None:
            self.vectorstore = FAISS.load_local(os.path.join(self.docs, "_embeddings"), self.embedding_model,
                                                allow_dangerous_deserialization=True)
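        # multilingual-e5-*-instruct models expect queries prefixed with a task instruction ("Instruct: ...\nQuery: ..."); passages are embedded without a prefix.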
query = f"Instruct: Given a search query, retrieve the relevant passages that answer the query\nQuery:{query}"
return self.vectorstore.similarity_search(query=query, k=20)
    @spaces.GPU
    async def initialize_llm(self):
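        # Load the model with 8-bit weights (bitsandbytes) to reduce GPU memory use; return_full_text=False makes the pipeline return only the newly generated text.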
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
        llm = AutoModelForCausalLM.from_pretrained(self.llm_path, quantization_config=bnb_config)
        self.llmtokenizer = AutoTokenizer.from_pretrained(self.llm_path)
        self.llmpipeline = pipeline(
            model=llm,
            tokenizer=self.llmtokenizer,
            task="text-generation",
            do_sample=True,
            temperature=0.7,
            repetition_penalty=1.1,
            return_full_text=False,
            max_new_tokens=500,
        )
    @spaces.GPU
    async def rag_prompt(self, query: str, rerank: bool, history: List[Dict]):
        retrieved_chunks = await self.retrieve_similar_embedding(query)
        retrieved_texts = [f"{chunk.metadata['source']}:\n{chunk.page_content}" for chunk in retrieved_chunks]
        context = "\n".join(retrieved_texts)
        history_text = "\n".join([h['content'] for h in history])
        # Assemble the final prompt from retrieved context, chat history and the current question
        final_prompt = f"""Context:
{context}
---
History:
{history_text}
---
Question: {query}"""
        # _generate_response_async is an async generator, so relay its chunks instead of awaiting it
        async for chunk in self._generate_response_async(final_prompt):
            yield chunk
    @spaces.GPU
    async def _generate_response_async(self, final_prompt: str):
        loop = asyncio.get_event_loop()
        # Run the blocking pipeline call in a worker thread; it returns a list like [{"generated_text": "..."}]
        outputs = await loop.run_in_executor(None, self.llmpipeline, final_prompt)
        yield outputs[0]["generated_text"]
    def launch_interface(self):
        with gr.Blocks() as demo:
            chatbot = gr.Chatbot(type="messages")
            msg = gr.Textbox()
            clear = gr.Button("Clear")
            reset = gr.Button("Reset")

            def user_input(user_message, history):
                return "", history + [{"role": "user", "content": user_message}]
            async def bot_response(history):
                response_generator = self.rag_prompt(history[-1]['content'], True, history)
                history.append({"role": "assistant", "content": ""})
                async for token in response_generator:
                    history[-1]['content'] += token
                    yield history

            msg.submit(user_input, [msg, chatbot], [msg, chatbot]).then(bot_response, chatbot, chatbot)
            clear.click(lambda: None, None, chatbot)
            reset.click(lambda: [], outputs=chatbot)
        demo.launch()
if __name__ == '__main__':
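    # Hugging Face model IDs; Llama-3.2-3B-Instruct is a gated repo, so the token used in login() above must have access.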
    model_paths = {
        'llm_path': 'meta-llama/Llama-3.2-3B-Instruct',
        'embed_model_path': 'intfloat/multilingual-e5-large-instruct',
    }
    docs_path = '/home/user/app/docs'
    bot = BSIChatbot(model_paths, docs_path)
    asyncio.run(bot.initialize_embedding_model(rebuild_embeddings=False))
    asyncio.run(bot.initialize_llm())
    bot.launch_interface()