import os

import gradio as gr
from llama_cpp import Llama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate


class RAGInterface:
    def __init__(self):
        # Initialize embedding model
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Load vector store
        persist_directory = os.path.join(os.path.dirname(__file__), 'mydb')
        self.vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings
        )

        # Initialize LLM
        self.llm = Llama.from_pretrained(
            repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
            filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
        )

        # Define RAG prompt template
        self.template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer the question in a clear way. If you cannot find the answer in the context, just say "I don't have enough information to answer this question."

Make sure to:
1. Only use information from the provided context
2. If you're unsure, acknowledge it
"""
        self.prompt = PromptTemplate.from_template(self.template)

    def respond(self, message, history, system_message, max_tokens, temperature):
        # Build messages list from the chat history
        messages = [{"role": "system", "content": system_message}]
        for user_msg, assistant_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

        # Search vector store for the top-5 most relevant chunks
        retriever = self.vectorstore.as_retriever(search_kwargs={"k": 5})
        docs = retriever.get_relevant_documents(message)
        context = "\n\n".join([doc.page_content for doc in docs])

        # Format prompt and add to messages
        final_prompt = self.prompt.format(context=context, question=message)
        messages.append({"role": "user", "content": final_prompt})

        # Generate response
        response = self.llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response['choices'][0]['message']['content']

    def create_interface(self):
        # Custom CSS for better styling
        custom_css = """
        """

        # Header HTML
        header_html = f"""