# Standard library.
import math
from typing import Dict, List, Optional, Tuple, Union

# Third-party: imaging, array/tensor libraries, flash-attention kernels,
# and the Hugging Face transformers building blocks.
import PIL.Image
import numpy as np
import torch
from flash_attn import flash_attn_varlen_func
from flash_attn.layers.rotary import apply_rotary_emb
from torch import Tensor, nn
from torch.nn import functional as F
from transformers import (
    AutoConfig,
    AutoImageProcessor,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from transformers.activations import ACT2FN
from transformers.generation.utils import GenerateOutput
from transformers.modeling_outputs import BaseModelOutputWithNoAttention
from transformers.modeling_utils import PreTrainedModel

# Package-local: relative import from the sibling module configuration_ovis2_5.
from .configuration_ovis2_5 import Siglip2NavitConfig, Ovis2_5_Config

# Module-level placeholder constants.
# NOTE(review): IMAGE_PLACEHOLDER is an empty string as written here; if it is
# meant to hold a tag-like marker token, the text may have been lost upstream
# -- confirm against the canonical file before relying on it.
IMAGE_PLACEHOLDER = ""
# Presumably a sentinel id paired with IMAGE_PLACEHOLDER (negative, so likely
# outside any real vocabulary range) -- TODO confirm against usage.
IMAGE_PLACEHOLDER_ID = -200
VIDEO_PLACEHOLDER = "