from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


def load_llm():
    """
    Download the quantized Arabic GGUF model and load it via llama-cpp.
    """
    model_file = hf_hub_download(
        repo_id="mobeidat/c4ai-command-r7b-arabic-02-2025-Q4_K_M-GGUF",
        filename="c4ai-command-r7b-arabic-02-2025-q4_k_m.gguf",
        local_dir="./models",
        local_dir_use_symlinks=False
    )

    # Force the ChatML prompt format and point llama-cpp at an empty grammar
    # file so it skips parsing the Jinja chat template embedded in the GGUF.
    # (grammar and grammar_path are mutually exclusive in LangChain's wrapper,
    # so only grammar_path is set here.)
    llm = LlamaCpp(
        model_path=model_file,
        flash_attn=False,
        n_ctx=2048,
        n_batch=512,
        chat_format="chatml",
        streaming=True,
        grammar_path="empty.jinja"  # ensure this file exists and is empty
    )
    return llm


def build_conversational_chain(vectorstore):
    """
    Create a ConversationalRetrievalChain using the local llama-cpp-based LLM
    and a ConversationBufferMemory for multi-turn Q&A.
    """
    llm = load_llm()

    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}
        ),
        memory=memory,
        verbose=True
    )
    return qa_chain
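

# Optional helper: load_llm() expects "empty.jinja" to exist next to the
# script. A minimal sketch (not part of the original listing) that creates
# the empty file before the model is loaded:
from pathlib import Path

def ensure_empty_grammar_file(path: str = "empty.jinja") -> None:
    """Create the empty grammar file referenced by load_llm() if missing."""
    Path(path).touch(exist_ok=True)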
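

# --- Usage sketch (illustrative; assumes FAISS and sentence-transformers
# are installed). The embedding model below is an arbitrary multilingual
# choice, not prescribed by this tutorial; any Arabic-capable embedder
# should work. Replace the sample texts with your own documents.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

if __name__ == "__main__":
    ensure_empty_grammar_file()

    # Tiny in-memory corpus for demonstration purposes only.
    texts = [
        "القاهرة هي عاصمة جمهورية مصر العربية.",  # "Cairo is the capital of Egypt."
        "تقع مدينة الرياض في المملكة العربية السعودية.",  # "Riyadh is located in Saudi Arabia."
    ]
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    vectorstore = FAISS.from_texts(texts, embeddings)

    qa_chain = build_conversational_chain(vectorstore)

    # First turn; the buffer memory records it so follow-up questions
    # can refer back to this exchange.
    result = qa_chain({"question": "ما هي عاصمة مصر؟"})  # "What is the capital of Egypt?"
    print(result["answer"])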