Spaces:
Runtime error
Runtime error
| from flask import Flask, request, Response | |
| import logging | |
| from llama_cpp import Llama | |
| import threading | |
| from huggingface_hub import snapshot_download, Repository | |
| import huggingface_hub | |
| import gc | |
| import os.path | |
| import csv | |
| from datetime import datetime | |
# Default system message (in Russian) used by get_system_tokens().
SYSTEM_PROMPT = "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык."

# Raw token ids spliced into prompts by get_message_tokens().
# NOTE(review): presumably tied to the vocabulary of the configured model --
# confirm before switching to a different model family.
SYSTEM_TOKEN, USER_TOKEN, BOT_TOKEN = 1788, 1404, 9225
LINEBREAK_TOKEN = 13

# Role name -> role marker token id.
ROLE_TOKENS = dict(user=USER_TOKEN, bot=BOT_TOKEN, system=SYSTEM_TOKEN)

# Arguments passed to init_model() at startup and on context-size changes.
CONTEXT_SIZE = 4000
ENABLE_GPU = True
GPU_LAYERS = 70
# The Flask application; its logger is used for all diagnostics in this file.
app = Flask(__name__)
app.logger.setLevel(logging.DEBUG)  # emit everything from DEBUG upwards

# Process-wide lock held by generate_tokens() while a response is streamed.
lock = threading.Lock()
# Model selection. Earlier/alternative choices are kept for reference:
#model_path = "../models/model-q4_K.gguf" # Replace with the actual model path
#model_name = "model/ggml-model-q4_K.gguf"
#repo_name = "IlyaGusev/saiga2_13b_gguf"
#model_name = "model-q4_K.gguf"
#repo_name = "IlyaGusev/saiga2_7b_gguf"
#model_name = "model-q4_K.gguf"
repo_name = "IlyaGusev/saiga2_70b_gguf"
model_name = "ggml-model-q4_1.gguf"

local_dir = '.'

if os.path.isdir('/data'):
    app.logger.info('Persistent storage enabled')

# Global model handle; populated by init_model().
model = None

# Download only the selected weights file. The value returned by
# snapshot_download is used as the directory that contains that file.
_snapshot_dir = snapshot_download(repo_id=repo_name, allow_patterns=model_name)
model_path = f"{_snapshot_dir}/{model_name}"
app.logger.info('Model path: ' + model_path)
# Dataset repository that receives the request/response log written by log().
DATASET_REPO_URL = "https://huggingface.co/datasets/muryshev/saiga-chat"
DATA_FILENAME = "data.csv"
DATA_FILE = os.path.join("dataset", DATA_FILENAME)

# Access token read from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

app.logger.info("hfh: "+huggingface_hub.__version__)

# Clone the dataset repository into the local "dataset" directory.
repo = Repository(
    clone_from=DATASET_REPO_URL,
    local_dir="dataset",
    use_auth_token=HF_TOKEN,
)
def log(request: str = '', response: str = ''):
    """Append one request/response pair to the dataset CSV and push it.

    Nothing is written or pushed when both ``request`` and ``response`` are
    empty.

    Args:
        request: The prompt text to record.
        response: The generated text to record.

    Returns:
        None.
    """
    if not (request or response):
        return

    # newline="" is what the csv module expects for files it writes, and an
    # explicit utf-8 encoding keeps Cyrillic text independent of the locale.
    with open(DATA_FILE, "a", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["request", "response", "time"])
        writer.writerow(
            {"request": request, "response": response, "time": str(datetime.now())}
        )

    # Push only after the file has been closed, so the new row is flushed to
    # disk before the repository is committed.
    commit_url = repo.push_to_hub()
    app.logger.info(commit_url)
def init_model(context_size, enable_gpu=False, gpu_layer_number=35):
    """(Re)create the global ``model`` and return it.

    Args:
        context_size: Value passed to ``Llama`` as ``n_ctx``.
        enable_gpu: When true, ``n_gpu_layers`` is passed to ``Llama``.
        gpu_layer_number: Value for ``n_gpu_layers`` (used only when
            ``enable_gpu`` is true).

    Returns:
        The newly created ``Llama`` instance (also stored in the global
        ``model``).
    """
    global model

    if model is not None:
        # Drop the reference to the previous instance and collect garbage
        # before loading the next one. Rebinding to None (instead of `del`)
        # keeps the global defined even if the constructor below raises.
        model = None
        gc.collect()

    llama_kwargs = dict(
        model_path=model_path,
        n_ctx=context_size,
        n_parts=1,
        #n_batch=100,
        logits_all=True,
        #n_threads=12,
        verbose=True,
        n_gqa=8,  # must be set for 70b models
    )
    if enable_gpu:
        llama_kwargs["n_gpu_layers"] = gpu_layer_number

    model = Llama(**llama_kwargs)
    return model
# Load the model once at import time using the configured defaults.
init_model(
    context_size=CONTEXT_SIZE,
    enable_gpu=ENABLE_GPU,
    gpu_layer_number=GPU_LAYERS,
)
def get_message_tokens(model, role, content):
    """Tokenize one chat message and wrap it with role/EOS markers.

    Args:
        model: Object providing ``tokenize(bytes)`` and ``token_eos()``.
        role: Key of ``ROLE_TOKENS`` ("user", "bot" or "system").
        content: Message text.

    Returns:
        The token list for ``content`` with the role token and a line-break
        token inserted at positions 1 and 2, followed by the EOS token.
    """
    encoded = model.tokenize(content.encode("utf-8"))
    # Splice the role marker and a line break in right after the first token
    # (presumably the BOS token added by the tokenizer -- confirm).
    encoded[1:1] = [ROLE_TOKENS[role], LINEBREAK_TOKEN]
    encoded.append(model.token_eos())
    return encoded
def get_system_tokens(model):
    """Return the message tokens for the default ``SYSTEM_PROMPT``.

    Args:
        model: Tokenizer/model object forwarded to ``get_message_tokens``.
    """
    return get_message_tokens(model, role="system", content=SYSTEM_PROMPT)
def get_system_tokens_for_preprompt(model, preprompt):
    """Return the message tokens for a caller-supplied system prompt.

    Args:
        model: Tokenizer/model object forwarded to ``get_message_tokens``.
        preprompt: Text used as the content of the "system" message.
    """
    return get_message_tokens(model, role="system", content=preprompt)
# Disabled warm-up: evaluate the system prompt once at startup.
#app.logger.info('Evaluating system tokens start')
#system_tokens = get_system_tokens(model)
#model.eval(system_tokens)
#app.logger.info('Evaluating system tokens end')

# Cancellation flag: handlers set it to True, and generate_tokens() checks it
# on every token and resets it to False when it stops.
stop_generation: bool = False
def generate_tokens(model, generator):
    """Stream detokenized output of ``generator`` until EOS or a stop request.

    The module-level ``lock`` is held for as long as this generator is being
    consumed. Iteration ends when the EOS token is produced or when the global
    ``stop_generation`` flag is set; the flag is reset to False at that point.

    Args:
        model: Object providing ``token_eos()`` and ``detokenize(list)``.
        generator: Iterable of token ids (the result of ``model.generate``).

    Yields:
        The result of ``model.detokenize([token])`` for each token, followed
        by a final ``b''`` when generation stops or an error occurs.
    """
    global stop_generation
    app.logger.info('generate_tokens started')
    with lock:
        try:
            # The EOS id does not change during a generation; look it up once.
            eos_token = model.token_eos()
            for token in generator:
                if token == eos_token or stop_generation:
                    stop_generation = False
                    app.logger.info('End generating')
                    yield b''  # End of chunk
                    break

                yield model.detokenize([token])  #.decode("utf-8", errors="ignore")
        except Exception:
            # Keep the stream well-formed for the client, but record the full
            # traceback so the failure is diagnosable.
            app.logger.exception('generator exception')
            yield b''  # End of chunk
def handler_change_context_size():
    """Reload the model with the context size given in the ``size`` query arg.

    ``size`` defaults to ``CONTEXT_SIZE`` when absent. A value that is not a
    positive integer is rejected with HTTP 400 before anything is stopped or
    reloaded.

    Returns:
        A plain-text ``Response``: 'Size changed' on success, 'Invalid size'
        (status 400) otherwise.
    """
    global stop_generation

    raw_size = request.args.get('size', CONTEXT_SIZE)
    try:
        new_size = int(raw_size)
    except (TypeError, ValueError):
        new_size = 0
    if new_size <= 0:
        return Response('Invalid size', status=400, content_type='text/plain')

    # Ask any running generation to stop before the model is replaced.
    stop_generation = True
    init_model(new_size, ENABLE_GPU, GPU_LAYERS)
    return Response('Size changed', content_type='text/plain')
def handler_stop_generation():
    """Request that the current generation stop.

    Sets the global ``stop_generation`` flag, which ``generate_tokens`` checks
    on every token.

    Returns:
        A plain-text ``Response`` with the body 'Stopped'.
    """
    global stop_generation
    stop_generation = True

    reply = Response('Stopped', content_type='text/plain')
    return reply
def generate_unknown_response():
    """Log an unexpected request and answer with a fixed plain-text message.

    Logs the HTTP method and, when the body parses as JSON, the payload;
    otherwise logs 'payload empty'.

    Returns:
        A plain-text ``Response`` with the body 'What do you want?'.
    """
    app.logger.info('unknown method: '+request.method)

    # silent=True yields None instead of raising when the body is missing or
    # is not valid JSON.
    request_payload = request.get_json(silent=True)
    if request_payload is None:
        app.logger.info('payload empty')
    else:
        # Format with %s: the payload is a dict/list, so concatenating it to
        # a str would raise TypeError.
        app.logger.info('payload: %s', request_payload)

    return Response('What do you want?', content_type='text/plain')
def generate_search_request():
    """Stream a completion for a single user query.

    Reads a JSON body with the keys ``query``, ``preprompt`` and
    ``parameters``. From ``parameters`` the keys ``temperature``, ``top_p``,
    ``repetition_penalty`` and ``top_k`` are honored; other keys are ignored.
    Only the first 200 characters of ``query`` are used.

    The prompt is: the ``preprompt`` as a system message (when non-empty),
    the user message, then the opening of a bot message.

    Returns:
        A streaming plain-text ``Response`` fed by ``generate_tokens``.
    """
    global stop_generation
    # Ask any running generation to stop, then clear the model state.
    stop_generation = True
    model.reset()

    data = request.get_json()
    app.logger.info(data)
    user_query = data.get("query", "")
    preprompt = data.get("preprompt", "")
    parameters = data.get("parameters", {})

    # Sampling parameters from the request
    temperature = parameters.get("temperature", 0.01)
    top_p = parameters.get("top_p", 0.85)
    repetition_penalty = parameters.get("repetition_penalty", 1.2)
    top_k = parameters.get("top_k", 30)

    tokens = []
    if preprompt:
        # Previously these tokens were built and then overwritten by the
        # user message, so a supplied preprompt never reached the model.
        tokens.extend(get_system_tokens_for_preprompt(model, preprompt))
        tokens.append(LINEBREAK_TOKEN)
    tokens.extend(get_message_tokens(model=model, role="user", content=user_query[:200]))
    tokens.extend([model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN])

    stop_generation = False
    generator = model.generate(
        tokens,
        top_k=top_k,
        top_p=top_p,
        temp=temperature,
        repeat_penalty=repetition_penalty
    )

    # Use Response to stream tokens
    return Response(generate_tokens(model, generator), content_type='text/plain', status=200, direct_passthrough=True)
def generate_response():
    """Stream a chat completion for the posted conversation and log it.

    Reads a JSON body with the keys ``messages`` and ``parameters``. Each
    message is a dict with ``from`` ("assistant" maps to the bot role,
    anything else to the user role) and ``content``. From ``parameters`` the
    keys ``temperature``, ``top_p``, ``repetition_penalty`` and ``top_k`` are
    honored; other keys are ignored.

    The prompt is the first ``CONTEXT_SIZE`` tokens of: all messages followed
    by the opening of a bot message.

    Returns:
        A streaming plain-text ``Response``. When the stream ends, the prompt
        and the full generated text are passed to ``log``.
    """
    global stop_generation
    # Ask any running generation to stop, then clear the model state.
    stop_generation = True
    model.reset()

    data = request.get_json()
    app.logger.info(data)
    messages = data.get("messages", [])
    parameters = data.get("parameters", {})

    # Sampling parameters from the request
    temperature = parameters.get("temperature", 0.01)
    top_p = parameters.get("top_p", 0.85)
    repetition_penalty = parameters.get("repetition_penalty", 1.2)
    top_k = parameters.get("top_k", 30)

    # NOTE(review): the prompt has never included a system message here (the
    # system tokens used to be computed and then discarded). That behavior
    # is kept -- confirm whether SYSTEM_PROMPT should be prepended.
    tokens = []
    for message in messages:
        role = "bot" if message.get("from") == "assistant" else "user"
        tokens.extend(
            get_message_tokens(model=model, role=role, content=message.get("content", ""))
        )
    tokens.extend([model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN])
    prompt_tokens = tokens[:CONTEXT_SIZE]

    app.logger.info('Prompt:')
    user_request = model.detokenize(prompt_tokens).decode("utf-8", errors="ignore")
    app.logger.info(user_request)

    stop_generation = False
    app.logger.info('Generate started')
    generator = model.generate(
        prompt_tokens,
        top_k=top_k,
        top_p=top_p,
        temp=temperature,
        repeat_penalty=repetition_penalty
    )
    app.logger.info('Generator created')

    def generate_and_log_tokens(model, generator):
        # generate_tokens yields detokenized bytes, not token ids, so the
        # end of generation is the end of the stream, not an EOS comparison.
        response_chunks = []
        for chunk in generate_tokens(model, generator):
            response_chunks.append(chunk)
            yield chunk

        # Decode once over the whole response so multi-byte characters split
        # across chunks are decoded correctly.
        response_text = b"".join(response_chunks).decode("utf-8", errors="ignore")
        try:
            log(user_request, response_text)
        except Exception:
            # The response has already been streamed; a logging failure
            # should not break it.
            app.logger.exception('failed to log conversation')

    # Use Response to stream tokens
    return Response(generate_and_log_tokens(model, generator), content_type='text/plain', status=200, direct_passthrough=True)
if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 7860, with
    # debug mode and threading both disabled.
    server_options = {
        "host": "0.0.0.0",
        "port": 7860,
        "debug": False,
        "threaded": False,
    }
    app.run(**server_options)