Update app.py
app.py CHANGED
@@ -37,7 +37,17 @@ model_name = "ggml-model-q4_1.gguf"
 
 snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
 
-
+model = Llama(
+    model_path=model_name,
+    n_ctx=2000,
+    n_parts=1,
+    #n_batch=100,
+    logits_all=True,
+    #n_threads=12,
+    verbose=True,
+    n_gpu_layers=35,
+    n_gqa=8 #must be set for 70b models
+)
 
 
 def get_message_tokens(model, role, content):

@@ -107,7 +117,7 @@ def generate_unknown_response():
 def generate_search_request():
     global stop_generation
     stop_generation = False
-
+    model.reset()
 
 
     data = request.get_json()

@@ -125,17 +135,7 @@ def generate_search_request():
     top_k = 20
     return_full_text = parameters.get("return_full_text", False)
 
-    model = Llama(
-        model_path=model_name,
-        n_ctx=2000,
-        n_parts=1,
-        #n_batch=100,
-        logits_all=True,
-        #n_threads=12,
-        verbose=True,
-        n_gpu_layers=30,
-        n_gqa=8 #must be set for 70b models
-    )
+
 
     tokens = get_system_tokens_for_preprompt(model, preprompt)
     tokens.append(LINEBREAK_TOKEN)

@@ -157,7 +157,7 @@ def generate_search_request():
 def generate_response():
     global stop_generation
     stop_generation = False
-
+    model.reset()
 
     data = request.get_json()
     app.logger.info(data)

@@ -175,18 +175,6 @@ def generate_response():
     return_full_text = parameters.get("return_full_text", False)
 
 
-    model = Llama(
-        model_path=model_name,
-        n_ctx=2000,
-        n_parts=1,
-        #n_batch=100,
-        logits_all=True,
-        #n_threads=12,
-        verbose=True,
-        n_gpu_layers=30,
-        n_gqa=8 #must be set for 70b models
-    )
-
     # Generate the response
     #system_tokens = get_system_tokens(model)
     #tokens = system_tokens
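Taken together, the hunks move the Llama(...) construction out of the two request handlers and up to module scope, so the model is loaded once right after snapshot_download (with n_gpu_layers raised from 30 to 35), and each handler now calls model.reset() before generating instead of rebuilding the model on every request. The sketch below shows that resulting pattern under stated assumptions: the Flask scaffolding, the route path, and the repo_name placeholder are not part of this diff; only the Llama(...) arguments and the handler lines visible in the hunks come from app.py.

# Sketch only — assumed Flask scaffolding; the Llama arguments and the
# handler lines with model.reset() are the ones shown in the diff above.
from flask import Flask, request
from huggingface_hub import snapshot_download
from llama_cpp import Llama

app = Flask(__name__)
stop_generation = False

repo_name = "<hf-repo-id>"  # set earlier in app.py; value not shown in this diff
model_name = "ggml-model-q4_1.gguf"

snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)

# Created once at import time instead of inside every request handler.
model = Llama(
    model_path=model_name,
    n_ctx=2000,
    n_parts=1,
    logits_all=True,
    verbose=True,
    n_gpu_layers=35,  # raised from 30 in this commit
    n_gqa=8           # must be set for 70b models
)

@app.route("/generate", methods=["POST"])  # route path is an assumption
def generate_response():
    global stop_generation
    stop_generation = False
    model.reset()  # clear the shared model's evaluated-token state per request
    data = request.get_json()
    app.logger.info(data)
    # prompt building and token generation continue as in app.py (unchanged here)
    ...

Loading the weights once avoids re-reading the multi-gigabyte model file on every request, and the per-request model.reset() is what keeps consecutive requests from sharing evaluated context on the now-shared instance.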