"""
https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
https://github.com/awinml/llama-cpp-python-bindings

python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat

python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/


./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128

./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128

./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv

"""

from simulator import Simulator
import llama_cpp
# import llama_cpp.llama_tokenizer
from transformers import AutoTokenizer


class Qwen2Simulator(Simulator):
    """Chat simulator backed by a Qwen2 0.5B GGUF model run through llama.cpp.

    The Hugging Face tokenizer is used to render chat histories into prompt
    text (via its chat template); the llama.cpp model then continues that
    prompt. The same model can play either the user (``generate_query``) or
    the assistant (``generate_response``).
    """

    def __init__(self, model_name_or_path=None):
        """Load the Hugging Face tokenizer and the GGUF model.

        :param model_name_or_path: accepted for interface compatibility but
            currently unused; the tokenizer and GGUF repositories are fixed
            below.
        """
        # Import the submodule explicitly instead of relying on it being
        # loaded as a side effect of ``import llama_cpp``.
        from llama_cpp.llama_tokenizer import LlamaHFTokenizer

        # Qwen2 chat checkpoints are published under the "-Instruct" suffix;
        # use the tokenizer that matches the GGUF repository loaded below.
        self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
        self.llm = llama_cpp.Llama.from_pretrained(
            repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
            filename="*fp16.gguf",
            tokenizer=LlamaHFTokenizer(self.hf_tokenizer),
            verbose=False,
        )
        # To run from local files instead, load the tokenizer from a local
        # directory and build ``llama_cpp.Llama(model_path=..., tokenizer=...,
        # verbose=False)``; optional knobs include ``n_gpu_layers=-1`` (GPU
        # offload), ``seed`` and ``n_ctx``.

    @staticmethod
    def _last_role(messages):
        """Return the role of the final message, rejecting an empty history.

        :raises ValueError: if ``messages`` is empty.
        """
        if not messages:
            raise ValueError("messages must contain at least one message")
        return messages[-1]["role"]

    def generate_query(self, messages) -> str:
        """Generate the next *user* turn for the given chat history.

        :param messages: chat history as a list of ``{"role", "content"}``
            dicts; the last message must not come from the user.
        :return: the generated user query text.
        :raises ValueError: if ``messages`` is empty or ends with a user turn.
        """
        if self._last_role(messages) == "user":
            raise ValueError(
                "generate_query expects the last message not to be from the user"
            )
        prompt = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        # Open a user turn by hand so the model continues as the user.
        prompt = prompt + "<|im_start|>user\n"
        return self._generate(prompt)

    def generate_response(self, messages) -> str:
        """Generate the next *assistant* turn for the given chat history.

        :param messages: chat history as a list of ``{"role", "content"}``
            dicts; the last message must come from the user.
        :return: the generated assistant response text.
        :raises ValueError: if ``messages`` is empty or does not end with a
            user turn.
        """
        if self._last_role(messages) != "user":
            raise ValueError(
                "generate_response expects the last message to be from the user"
            )
        prompt = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        return self._generate(prompt)

    def _generate(self, inputs, max_tokens=20, temperature=5) -> str:
        """Run a single non-streaming completion and return the generated text.

        Known model quirk: the generated user turn sometimes ends without
        ``<|im_end|>`` and the model runs straight into the next turn, e.g.::

            <|im_start|>system
            you are a helpful assistant<|im_end|>
            <|im_start|>user
            hi, what your name<|im_end|>
            <|im_start|>assistant
            My name is Jordan<|im_end|>
            <|im_start|>user              # input above, generation below
            how old are you?
            <|im_start|>assistant
            I am a 41-year-old man.<|im_end|>

        ``<|im_start|>`` is therefore used as a stop sequence in addition to
        ``<|im_end|>`` so generation halts at the turn boundary either way.

        :param inputs: fully rendered prompt text.
        :param max_tokens: maximum number of tokens to generate.
        :param temperature: sampling temperature.
            NOTE(review): the default of 5 is far above typical sampling
            temperatures; confirm it is intentional.
        :return: text of the first completion choice.
        """
        completion = self.llm(
            inputs,
            max_tokens=max_tokens,
            temperature=temperature,
            stop=["<|im_end|>", "<|im_start|>"],
        )
        return completion["choices"][0]["text"]


# Shared simulator instance, created when this module is imported.
bot = Qwen2Simulator()


if __name__ == "__main__":
    # Manual check: the history ends with an assistant turn, so ask the model
    # to play the user and produce the next query.
    #
    # To exercise the assistant side instead, end the history with a user
    # message, e.g. {"role": "user", "content": "What is the capital of France?"},
    # and call bot.generate_response(...).
    history = [
        dict(role="system", content="you are a helpful assistant"),
        dict(role="user", content="hi, what your name"),
        dict(role="assistant", content="My name is Jordan"),
    ]
    print(bot.generate_query(history))