Commit fe3defb · Daniel Marques committed
Parent(s): 416e7fd
feat: add websocket
Changed files:
- constants.py +2 -2
- load_models.py +0 -2
- main.py +1 -0
- prompt_template_utils.py +6 -1
constants.py CHANGED
@@ -37,8 +37,8 @@ MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS =
-N_BATCH =
+N_GPU_LAYERS = 100  # Llama-2-70B has 83 layers
+N_BATCH = CONTEXT_WINDOW_SIZE
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
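
For reference, the halving advice in the comment works out to something like the sketch below; the concrete CONTEXT_WINDOW_SIZE value is an assumption for illustration only (the real one is defined earlier in constants.py):

# Illustrative only (not part of the commit): applying the "halve until the
# buffer error stops" advice from the comment above.
CONTEXT_WINDOW_SIZE = 4096            # assumed value, for illustration only
N_GPU_LAYERS = 100                    # Llama-2-70B has 83 layers
N_BATCH = CONTEXT_WINDOW_SIZE         # 4096

# If "not enough space in the buffer" appears, halve and retry:
# N_BATCH = CONTEXT_WINDOW_SIZE // 2  # 2048
# N_BATCH = CONTEXT_WINDOW_SIZE // 4  # 1024
# ...and likewise reduce N_GPU_LAYERS, until the error goes away.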
load_models.py CHANGED
@@ -58,8 +58,6 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
             "model_path": model_path,
             "n_ctx": CONTEXT_WINDOW_SIZE,
             "max_tokens": MAX_NEW_TOKENS,
-            "n_batch": MAX_NEW_TOKENS,
-
             # set this based on your GPU & CPU RAM
         }
         if device_type.lower() == "mps":
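
For context on where this kwargs dict ends up: a minimal sketch of the surrounding pattern, assuming LangChain's LlamaCpp wrapper consumes it and that the device-specific branches (only the mps check is visible in the hunk) add the GPU keys separately:

# Sketch under assumptions: only the `if device_type.lower() == "mps":` line is
# visible in the hunk; the branch bodies and the cuda branch are assumptions.
from langchain.llms import LlamaCpp

def build_llm_sketch(model_path, device_type, context_window, max_new_tokens,
                     n_gpu_layers, n_batch):
    kwargs = {
        "model_path": model_path,
        "n_ctx": context_window,
        "max_tokens": max_new_tokens,
        # set this based on your GPU & CPU RAM
    }
    if device_type.lower() == "mps":
        kwargs["n_gpu_layers"] = 1        # assumed Metal offload setting
    elif device_type.lower() == "cuda":
        kwargs["n_gpu_layers"] = n_gpu_layers
        kwargs["n_batch"] = n_batch       # assumed: batch size set per device, not in the base dict
    return LlamaCpp(**kwargs)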
main.py CHANGED
@@ -51,6 +51,7 @@ QA = RetrievalQA.from_chain_type(
     return_source_documents=SHOW_SOURCES,
     chain_type_kwargs={
         "prompt": prompt,
+        "memory": memory
     },
 )
 
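
A minimal sketch of how the added memory entry is typically wired, assuming get_prompt_template returns a (prompt, memory) pair as the prompt_template_utils.py hunk below suggests; llm and retriever stand in for objects main.py builds elsewhere:

# Sketch under assumptions: get_prompt_template is assumed to return
# (prompt, memory); llm, retriever and show_sources are placeholders for
# objects built elsewhere in main.py. The promptTemplate_type value is illustrative.
from langchain.chains import RetrievalQA
from prompt_template_utils import get_prompt_template

def build_qa_chain(llm, retriever, show_sources=True):
    prompt, memory = get_prompt_template(promptTemplate_type="llama")
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=show_sources,
        chain_type_kwargs={
            "prompt": prompt,
            # the added memory records each question/answer turn and feeds it
            # back into the prompt's "history" variable on the next call
            "memory": memory,
        },
    )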
prompt_template_utils.py CHANGED
@@ -6,6 +6,11 @@ This seems to have significant impact on the output of the LLM.
 
 from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
+from langchain.memory.chat_message_histories import RedisChatMessageHistory
+
+message_history = RedisChatMessageHistory(
+    url="redis://localhost:6379/1", ttl=600, session_id="my-session"
+)
 
 # this is specific to Llama-2.
 
@@ -84,7 +89,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
     )
     prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
 
-    memory = ConversationBufferMemory(input_key="question", memory_key="history")
+    memory = ConversationBufferMemory(input_key="question", memory_key="history", chat_memory=message_history)
 
     return (
         prompt,
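
Because the buffer memory is now backed by Redis, conversation history lives outside the Python process and expires after the configured TTL. A minimal sketch of that behaviour, assuming a Redis server is reachable at redis://localhost:6379/1:

# Minimal sketch: the Redis-backed history added in this commit survives
# process restarts. Requires a running Redis server at redis://localhost:6379/1.
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import RedisChatMessageHistory

message_history = RedisChatMessageHistory(
    url="redis://localhost:6379/1", ttl=600, session_id="my-session"  # ttl in seconds
)
memory = ConversationBufferMemory(
    input_key="question", memory_key="history", chat_memory=message_history
)

# Each saved turn is written to Redis, so a new process using the same
# session_id sees the earlier conversation until the 600-second TTL expires.
memory.save_context({"question": "What is in the documents?"}, {"output": "..."})
print(memory.load_memory_variables({})["history"])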