Update app.py
app.py CHANGED
@@ -71,20 +71,20 @@ stop_generation = False
 def generate_tokens(model, generator):
     global stop_generation
     app.logger.info('generate_tokens started')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with lock:
+        try:
+            for token in generator:
+                if token == model.token_eos() or stop_generation:
+                    stop_generation = False
+                    app.logger.info('Abort generating')
+                    yield b'' # End of chunk
+                    break
+
+                token_str = model.detokenize([token])#.decode("utf-8", errors="ignore")
+                yield token_str
+        except Exception as e:
+            app.logger.info('generator exception')
+            yield b'' # End of chunk
 
 @app.route('/stop_generation', methods=['GET'])
 def handler_stop_generation():
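
Note: the new body relies on module-level state declared earlier in app.py.
stop_generation = False is visible in the hunk header above; the lock is not
shown anywhere in this diff, so the sketch below assumes it is a plain
threading.Lock created next to that flag:

    import threading

    lock = threading.Lock()   # serializes model access across requests
    stop_generation = False   # set to True by the /stop_generation route
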
@@ -133,7 +133,7 @@ def generate_search_request():
         logits_all=True,
         #n_threads=12,
         verbose=True,
-        n_gpu_layers=
+        n_gpu_layers=30,
         n_gqa=8 #must be set for 70b models
     )
 
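
For reference, a sketch of how these keyword arguments typically sit in a
llama-cpp-python model load. The model_path value is a placeholder (the
actual model file is not shown in this diff), and n_gqa was only accepted by
the older GGML-era releases of llama-cpp-python, where it had to be set to 8
for LLaMA-2 70B models:

    from llama_cpp import Llama

    model = Llama(
        model_path="models/llama-2-70b.ggmlv3.q4_0.bin",  # placeholder path
        logits_all=True,   # keep logits for every token, not just the last
        verbose=True,
        n_gpu_layers=30,   # offload 30 transformer layers to the GPU
        n_gqa=8,           # grouped-query attention groups; needed for 70B here
    )
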
@@ -183,7 +183,7 @@ def generate_response():
         logits_all=True,
         #n_threads=12,
         verbose=True,
-        n_gpu_layers=
+        n_gpu_layers=30,
         n_gqa=8 #must be set for 70b models
     )
 
@@ -239,4 +239,4 @@ def generate_response():
     return Response(generate_tokens(model, generator), content_type='text/plain', status=200, direct_passthrough=True)
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=False, threaded=
+    app.run(host="0.0.0.0", port=7860, debug=False, threaded=False)
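
With threaded=False, Werkzeug's development server handles one request at a
time, which together with the lock in generate_tokens keeps generations
strictly serialized; note that a single-threaded server appears to queue the
abort request until the current response finishes, so the stop flag matters
most when running threaded or behind a multi-worker WSGI server. A minimal
client sketch for consuming the stream follows; the /generate route and the
JSON payload are assumptions (only /stop_generation appears in this diff):

    import requests

    BASE = "http://localhost:7860"  # host/port from app.run() above

    with requests.post(f"{BASE}/generate", json={"prompt": "Hello"}, stream=True) as resp:
        for chunk in resp.iter_content(chunk_size=None):
            # the server streams raw detokenized bytes as text/plain
            print(chunk.decode("utf-8", errors="ignore"), end="", flush=True)

    # a second client can request an abort:
    # requests.get(f"{BASE}/stop_generation")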