import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import gradio as gr
import chromadb
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document
from langchain_huggingface import HuggingFacePipeline

# Download the model from HuggingFace and load it in 4-bit precision
model_name = "anakin87/zephyr-7b-alpha-sharded"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token_id = 1  # Set beginning-of-sentence token id

# Specify the embedding model
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}  # Using CPU since GPU is not available
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=model_kwargs)

# Load the documents (replace this with your document loading logic);
# the splitter expects Document objects, not plain strings
documents = [
    Document(page_content="Sample document text 1"),
    Document(page_content="Sample document text 2"),
]

# Split the documents into small chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(documents)

# Embed the document chunks and persist them in a Chroma vector store
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

# Specify the retriever
retriever = vectordb.as_retriever()

# Build the HuggingFace pipeline for zephyr-7b-alpha
# (named text_generation_pipeline to avoid shadowing transformers.pipeline)
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Wrap the pipeline as a LangChain LLM
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


# Run one turn of the retrieval-augmented conversation
def create_conversation(query: str, chat_history: list) -> tuple:
    try:
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=False,
        )
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            memory=memory,
            get_chat_history=lambda h: h,
        )
        result = qa_chain({"question": query, "chat_history": chat_history})
        chat_history.append((query, result["answer"]))
        return "", chat_history
    except Exception as e:
        # Surface the error message as a chat reply instead of crashing the UI
        chat_history.append((query, str(e)))
        return "", chat_history


# Define the Gradio UI
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="My Chatbot")
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])
    msg.submit(create_conversation, [msg, chatbot], [msg, chatbot])

# Launch the Gradio demo
demo.launch()
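
# ---------------------------------------------------------------------------
# Sketch: swapping the placeholder documents for real PDFs
# ---------------------------------------------------------------------------
# The script imports PyPDFDirectoryLoader but indexes hard-coded sample strings.
# A minimal sketch of loading an actual PDF folder instead, assuming your PDFs
# sit in a local "pdfs/" directory (the directory name is an assumption, not
# part of the original script):
#
#   loader = PyPDFDirectoryLoader("pdfs/")
#   documents = loader.load()  # returns LangChain Document objects, one per page
#   all_splits = text_splitter.split_documents(documents)
#   vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings,
#                                    persist_directory="chroma_db")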