Commit fe3defb · Daniel Marques committed
Parent(s): 416e7fd
feat: add websocket
Changed files:
- constants.py +2 -2
- load_models.py +0 -2
- main.py +1 -0
- prompt_template_utils.py +6 -1
constants.py CHANGED
@@ -37,8 +37,8 @@ MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS =
-N_BATCH =
+N_GPU_LAYERS = 100  # Llama-2-70B has 83 layers
+N_BATCH = CONTEXT_WINDOW_SIZE
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
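
For reference, the halving advice in the comment works out to something like the sketch below; the concrete CONTEXT_WINDOW_SIZE value is an assumption for illustration only (the real one is defined earlier in constants.py):

# Illustrative only (not part of the commit): applying the "halve until the
# buffer error stops" advice from the comment above.
CONTEXT_WINDOW_SIZE = 4096            # assumed value, for illustration only
N_GPU_LAYERS = 100                    # Llama-2-70B has 83 layers
N_BATCH = CONTEXT_WINDOW_SIZE         # 4096

# If "not enough space in the buffer" appears, halve and retry:
# N_BATCH = CONTEXT_WINDOW_SIZE // 2  # 2048
# N_BATCH = CONTEXT_WINDOW_SIZE // 4  # 1024
# ...and likewise reduce N_GPU_LAYERS, until the error goes away.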
load_models.py CHANGED
@@ -58,8 +58,6 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
             "model_path": model_path,
             "n_ctx": CONTEXT_WINDOW_SIZE,
             "max_tokens": MAX_NEW_TOKENS,
-            "n_batch": MAX_NEW_TOKENS,
-
             # set this based on your GPU & CPU RAM
         }
         if device_type.lower() == "mps":
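
For context on where this kwargs dict ends up: a minimal sketch of the surrounding pattern, assuming LangChain's LlamaCpp wrapper consumes it and that the device-specific branches (only the mps check is visible in the hunk) add the GPU keys separately:

# Sketch under assumptions: only the `if device_type.lower() == "mps":` line is
# visible in the hunk; the branch bodies and the cuda branch are assumptions.
from langchain.llms import LlamaCpp

def build_llm_sketch(model_path, device_type, context_window, max_new_tokens,
                     n_gpu_layers, n_batch):
    kwargs = {
        "model_path": model_path,
        "n_ctx": context_window,
        "max_tokens": max_new_tokens,
        # set this based on your GPU & CPU RAM
    }
    if device_type.lower() == "mps":
        kwargs["n_gpu_layers"] = 1        # assumed Metal offload setting
    elif device_type.lower() == "cuda":
        kwargs["n_gpu_layers"] = n_gpu_layers
        kwargs["n_batch"] = n_batch       # assumed: batch size set per device, not in the base dict
    return LlamaCpp(**kwargs)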
main.py CHANGED
@@ -51,6 +51,7 @@ QA = RetrievalQA.from_chain_type(
     return_source_documents=SHOW_SOURCES,
     chain_type_kwargs={
         "prompt": prompt,
+        "memory": memory
     },
 )
 
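
A minimal sketch of how the added memory entry is typically wired, assuming get_prompt_template returns a (prompt, memory) pair as the prompt_template_utils.py hunk below suggests; llm and retriever stand in for objects main.py builds elsewhere:

# Sketch under assumptions: get_prompt_template is assumed to return
# (prompt, memory); llm, retriever and show_sources are placeholders for
# objects built elsewhere in main.py. The promptTemplate_type value is illustrative.
from langchain.chains import RetrievalQA
from prompt_template_utils import get_prompt_template

def build_qa_chain(llm, retriever, show_sources=True):
    prompt, memory = get_prompt_template(promptTemplate_type="llama")
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=show_sources,
        chain_type_kwargs={
            "prompt": prompt,
            # the added memory records each question/answer turn and feeds it
            # back into the prompt's "history" variable on the next call
            "memory": memory,
        },
    )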
prompt_template_utils.py CHANGED
@@ -6,6 +6,11 @@ This seems to have significant impact on the output of the LLM.
 
 from langchain.memory import ConversationBufferMemory
 from langchain.prompts import PromptTemplate
+from langchain.memory.chat_message_histories import RedisChatMessageHistory
+
+message_history = RedisChatMessageHistory(
+    url="redis://localhost:6379/1", ttl=600, session_id="my-session"
+)
 
 # this is specific to Llama-2.
 
@@ -84,7 +89,7 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
     )
     prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
 
-    memory = ConversationBufferMemory(input_key="question", memory_key="history")
+    memory = ConversationBufferMemory(input_key="question", memory_key="history", chat_memory=message_history)
 
     return (
         prompt,
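
Because the buffer memory is now backed by Redis, conversation history lives outside the Python process and expires after the configured TTL. A minimal sketch of that behaviour, assuming a Redis server is reachable at redis://localhost:6379/1:

# Minimal sketch: the Redis-backed history added in this commit survives
# process restarts. Requires a running Redis server at redis://localhost:6379/1.
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import RedisChatMessageHistory

message_history = RedisChatMessageHistory(
    url="redis://localhost:6379/1", ttl=600, session_id="my-session"  # ttl in seconds
)
memory = ConversationBufferMemory(
    input_key="question", memory_key="history", chat_memory=message_history
)

# Each saved turn is written to Redis, so a new process using the same
# session_id sees the earlier conversation until the 600-second TTL expires.
memory.save_context({"question": "What is in the documents?"}, {"output": "..."})
print(memory.load_memory_variables({})["history"])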