from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import onnx
import onnxruntime as ort
import numpy as np
import os
from tqdm import tqdm
from transformers import AutoConfig
from typing import List, Tuple
from axengine import InferenceSession
from ml_dtypes import bfloat16


device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = torch.load("./embeds/SmolVLMVisionEmbeddings.pkl", map_location=device, weights_only=False)
embeds = np.load(os.path.join("./smolvlm2_axmodel", "model.embed_tokens.weight.npy"))
# connector = torch.load("SmolVLMConnector.pkl", map_location=device, weights_only=False)
encoder = ort.InferenceSession(f'./vit_mdoel/vision_model.onnx', providers=["CPUExecutionProvider"])


def run_vision_model(
    pixel_values,
    patch_attention_mask=None,
):
    batch_size = pixel_values.size(0)
    if patch_attention_mask is None:
        patch_size = 16
        patch_attention_mask = torch.ones(
            (
                batch_size,
                pixel_values.size(2) // patch_size,
                pixel_values.size(3) // patch_size,
            )
        )
        patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)

    hidden_states = embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)

    patch_attention_mask = patch_attention_mask.view(batch_size, -1)
    # The call to `_upad_input` in `_flash_attention_forward` is expensive
    # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
    # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
    if not torch.any(~patch_attention_mask):
        patch_attention_mask = None
    elif not self._use_flash_attention_2:
        patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)

    encoder_outputs = encoder.run(None, {"input": hidden_states.detach().cpu().to(dtype=torch.float32).numpy()})[0]
    encoder_outputs = torch.from_numpy(encoder_outputs).to(device, dtype=hidden_states.dtype)

    return encoder_outputs


def get_image_features(pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
    """
    Encodes images into continuous embeddings that can be forwarded to the language model.

    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        pixel_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask indicating padded regions in the image.
    """
    batch_size, num_images, num_channels, height, width = pixel_values.shape
    pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

    # Remove padding images - padding images are full 0.
    nb_values_per_image = pixel_values.shape[1:].numel()
    real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image

    if not any(real_images_inds):
        # no images, leave one empty image.
        real_images_inds[0] = True

    pixel_values = pixel_values[real_images_inds].contiguous()
    # Handle the vision attention mask
    if pixel_attention_mask is None:
        pixel_attention_mask = torch.ones(
            size=[pixel_values.shape[i] for i in (0, 2, 3)],
            dtype=torch.bool,
            device=pixel_values.device,
        )
    else:
        # Remove padding images from the mask
        pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
        pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
    patch_size = 16
    patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
    patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
    patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

    # Get sequence from the vision encoder
    image_hidden_states = run_vision_model(pixel_values, patch_attention_mask)

    # Modality projection & resampling
    # image_hidden_states = connector(image_hidden_states) # 已经 fuse 到了 onnx 中
    return image_hidden_states


def inputs_merger(
        input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
    ):
    """
    This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
    The merging happens as follows:
    - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
    - We get the image hidden states for the image through the vision encoder and that hidden state, after a pixel shuffle operation, is then projected into the text embedding space.
    We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
    - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
    - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
    """
    _, patch_size, _ = image_hidden_states.shape

    image_mask = input_ids == 49190 # self.image_token_id
    num_image_tokens = image_mask.sum(dim=1)
    if not torch.all(num_image_tokens % patch_size == 0):
        raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")

    blocks_per_sample = num_image_tokens // patch_size

    offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
    block_offset = offsets[:-1]
    row_cum = image_mask.cumsum(dim=-1)
    chunk_idx = (row_cum - 1) // patch_size
    local_idx = (row_cum - 1) % patch_size
    block_idx = block_offset.unsqueeze(1) + chunk_idx

    image_embeds = torch.zeros_like(inputs_embeds)
    image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

    merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
    return merged_embeds


def post_process(data, topk=1, topp=0.9, temperature=0.6):
    def top_p(l: np.ndarray, p: float) -> np.ndarray:
        index = np.argsort(l)
        res = l.copy()
        sum_p = 0
        for i in index[::-1]:
            if sum_p >= p:
                res[i] = 0
            sum_p += res[i]
        return res / sum_p

    def softmax(l: np.ndarray) -> np.ndarray:
        l_max = l - l.max()
        l_exp = np.exp(l_max)
        res = l_exp / np.sum(l_exp)
        return res.astype(np.float64)

    r = data.astype(np.float32)
    r = r.flatten()
    candidate_index = np.argpartition(r, -topk)[-topk:]
    candidate_value = r[candidate_index]
    candidate_value /= temperature
    candidate_soft = softmax(candidate_value)
    candidate_soft = top_p(candidate_soft, topp)
    candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
    pos = np.random.multinomial(1, candidate_soft).argmax()
    next_token = candidate_index[pos]
    return next_token, candidate_index, candidate_soft


if __name__ == "__main__":

    hf_model_path = "./smolvlm2_tokenizer/"
    axmodel_path = "./smolvlm2_axmodel"
    prompt = 'Can you describe this image?'

    processor = AutoProcessor.from_pretrained(hf_model_path)
    config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
    tokenizer = processor.tokenizer

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": "./assets/bee.jpg"},
                {"type": "text", "text": prompt},
            ]
        },
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device, dtype=torch.bfloat16)

    pixel_values = inputs["pixel_values"]
    pixel_attention_mask = inputs["pixel_attention_mask"]
    input_ids = inputs["input_ids"]
    input_ids_length = input_ids.shape[1]

    inputs_embeds = np.take(embeds, input_ids[0].cpu().numpy().tolist(), axis=0)[None, ...]
    inputs_embeds = torch.from_numpy(inputs_embeds).to(device, dtype=torch.bfloat16)

    """
    miniforge-pypy3/envs/lerobot/lib/python3.10/site-packages/transformers/models/smolvlm/modeling_smolvlm.py(681)get_image_features()
    """
    image_hidden_states = get_image_features(pixel_values, pixel_attention_mask)

    inputs_embeds = inputs_merger(
        input_ids=input_ids,
        inputs_embeds=inputs_embeds,
        image_hidden_states=image_hidden_states,
    ).to(dtype=torch.float32).cpu().numpy()

    prefill_data = inputs_embeds
    prefill_data = prefill_data.astype(bfloat16)
    token_ids = input_ids[0].cpu().numpy().tolist()
    token_len = len(token_ids)

    lastN = 2048
    cfg = config.text_config

    kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
    k_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]
    v_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]

    prefill_decoder_sessins = []
    for i in tqdm(range(cfg.num_hidden_layers), desc="Init InferenceSession"):
        session = InferenceSession(
            f"{axmodel_path}/llama_p1024_l{i}_together.axmodel"
        )
        prefill_decoder_sessins.append(session)
    post_process_session = InferenceSession(
        f"{axmodel_path}/llama_post.axmodel"
    )
    print("model load done!")

    """
        prefill
    """
    prefill_len = 1024

    if prefill_len > 0:
        indices = np.array(list(range(prefill_len)), np.uint32).reshape(
            (1, prefill_len)
        )
        indices[:, token_len:] = 0
        mask = np.zeros((1, prefill_len, prefill_len)) - 65536
        data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
        data[:, 0:token_len] = prefill_data
        for i, t in enumerate(token_ids):
            mask[:, i, : i + 1] = 0
        mask = mask.astype(bfloat16)
        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=1)
            k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
            v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
            data[:, :token_len] = outputs[2][:, :token_len, :]

    post_out = post_process_session.run(None, {"input": data[:, token_len - 1, :][None, ...]})[0]
    next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
    posibles = [tokenizer.decode([t]) for t in posssible_tokens]
    posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
    token_ids.append(next_token)
    # print("prefill done!")
    print(f"input prompt: {prompt}\n")
    print("answer >>", tokenizer.decode(token_ids[token_len], skip_special_tokens=True), end='', flush=True)

    """
        decode
    """
    mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
    mask[:, :, :lastN] -= 65536
    mask[:, :, :token_len] = 0
    for start_indice in range(lastN + 1):
        if prefill_len > 0 and start_indice < token_len:
            continue
        next_token = token_ids[start_indice]
        indices = np.array([start_indice], np.uint32).reshape((1, 1))
        data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)

        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": k_caches[i],
                "V_cache": v_caches[i],
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=0)
            k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
            v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
            data = outputs[2]

        mask[..., start_indice] = 0
        if start_indice < token_len - 1:
            pass
        else:
            post_out = post_process_session.run(None, {"input": data})[0]
            next_token, posssible_tokens, possible_soft = post_process(post_out)
            token_ids.append(next_token)
            print(tokenizer.decode(next_token, skip_special_tokens=True), end='', flush=True)

        if next_token == tokenizer.eos_token_id:
            break
    print("\n")