from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


def load_llm():
    """
    Downloads the GGUF model for Arabic and loads it via llama-cpp.
    """
    model_file = hf_hub_download(
        repo_id="mobeidat/c4ai-command-r7b-arabic-02-2025-Q4_K_M-GGUF",
        filename="c4ai-command-r7b-arabic-02-2025-q4_k_m.gguf",
        local_dir="./models",
        local_dir_use_symlinks=False,
    )

    llm = LlamaCpp(
        model_path=model_file,
        n_ctx=2048,
        n_batch=512,
        max_tokens=256,
        temperature=0.8,
        top_k=40,
        top_p=0.95,
        repeat_penalty=1.1,
        last_n_tokens_size=64,
        streaming=True,
        verbose=True,
        # Force the ChatML template so llama-cpp-python does not try to parse
        # the Jinja chat template embedded in the GGUF metadata (which can fail
        # for this model on older builds). model_kwargs is forwarded verbatim
        # to the underlying llama_cpp.Llama constructor.
        model_kwargs={"chat_format": "chatml"},
        # All other llama.cpp options (GPU offload, mmap/mlock, RoPE scaling,
        # grammar, seed, ...) are left at their defaults.
    )
    return llm


def build_conversational_chain(vectorstore):
    """
    Creates a ConversationalRetrievalChain using the local llama-cpp-based LLM
    and a ConversationBufferMemory for multi-turn Q&A.
    """
    llm = load_llm()

    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5},
        ),
        memory=memory,
        verbose=True,
    )
    return qa_chain
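

# --- Usage sketch (not part of the original module) -----------------------
# A minimal smoke test, assuming a FAISS index built from a few in-memory
# texts. The embedding model name, the sample texts, and the question below
# are illustrative assumptions; any LangChain vector store would plug into
# build_conversational_chain() the same way.
if __name__ == "__main__":
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    texts = [
        "GGUF is the quantized model file format used by llama.cpp.",
        "Command R7B Arabic is a 7B-parameter model tuned for Arabic tasks.",
    ]
    # Hypothetical choice of a multilingual sentence-embedding model.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    vectorstore = FAISS.from_texts(texts, embeddings)

    chain = build_conversational_chain(vectorstore)
    # With memory attached, only the new question is passed in; the chain
    # tracks chat_history across turns and returns its output under "answer".
    result = chain({"question": "What is GGUF?"})
    print(result["answer"])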