Commit 8fa0233 · Daniel Marques committed
Parent(s): 2ea73cf

fix: add streamer

Files changed:
- load_models.py +7 -3
- main.py +2 -3
load_models.py

@@ -22,7 +22,7 @@ torch.set_grad_enabled(False)
 from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH
 
 
-def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging):
+def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, logging, stream = False):
     """
     Load a GGUF/GGML quantized model using LlamaCpp.
 
@@ -56,13 +56,17 @@ def load_quantized_model_gguf_ggml(model_id, model_basename, device_type, loggin
             "model_path": model_path,
             "n_ctx": CONTEXT_WINDOW_SIZE,
             "max_tokens": MAX_NEW_TOKENS,
-            "n_batch": N_BATCH,
+            "n_batch": N_BATCH,
+            # set this based on your GPU & CPU RAM
         }
         if device_type.lower() == "mps":
             kwargs["n_gpu_layers"] = 1
         if device_type.lower() == "cuda":
             kwargs["n_gpu_layers"] = N_GPU_LAYERS  # set this based on your GPU
 
+        #add stream
+        kwargs["stream"] = stream
+
         return LlamaCpp(**kwargs)
     except:
         if "ggml" in model_basename:
@@ -185,7 +189,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging, stre
 
     if model_basename is not None:
         if ".gguf" in model_basename.lower():
-            llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
+            llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING, stream)
             return llm
         elif ".ggml" in model_basename.lower():
             model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
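The change threads a stream flag from load_model through load_quantized_model_gguf_ggml and into the kwargs passed to LlamaCpp, so callers can opt into token-by-token output instead of a single completed string. A minimal sketch of exercising the new flag directly follows; the model repo and file names are placeholders, and the .stream() call assumes a LangChain version where LlamaCpp supports iterator-style streaming:

import logging

from load_models import load_quantized_model_gguf_ggml

# stream=True is forwarded into kwargs["stream"] before LlamaCpp(**kwargs) is built.
llm = load_quantized_model_gguf_ggml(
    model_id="TheBloke/Llama-2-7B-Chat-GGUF",      # hypothetical GGUF repo
    model_basename="llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical GGUF file
    device_type="cuda",
    logging=logging,
    stream=True,
)

# Consume tokens incrementally as they are generated.
for chunk in llm.stream("Summarize the indexed documents in one sentence."):
    print(chunk, end="", flush=True)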
main.py

@@ -42,7 +42,8 @@ DB = Chroma(
 
 RETRIEVER = DB.as_retriever()
 
-models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=
+models = load_model(device_type=DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME, stream=True)
+
 LLM = models[0]
 STREAMER = models[1]
 
@@ -164,8 +165,6 @@ async def predict(data: Predict):
     global QA
     user_prompt = data.prompt
     if user_prompt:
-        # print(f'User Prompt: {user_prompt}')
-        # Get the answer from the chain
         res = QA(user_prompt)
 
         print(res)
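In main.py the loader is now invoked with stream=True while the existing tuple contract (LLM = models[0], STREAMER = models[1]) is kept, and the predict route shown here still builds res = QA(user_prompt) in one step. A hypothetical follow-up, not part of this commit, could expose the stream through a FastAPI StreamingResponse; the route name and the assumption that LLM.stream() yields plain text chunks are illustrative only:

from fastapi.responses import StreamingResponse

@app.post("/api/stream")
async def stream_prompt(data: Predict):
    # Push tokens to the client as LlamaCpp produces them instead of
    # waiting for the whole completion (assumes iterator-style streaming).
    def token_stream():
        for chunk in LLM.stream(data.prompt):
            yield chunk
    return StreamingResponse(token_stream(), media_type="text/plain")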