# processing_opencua.py
import torch
from typing import Any, Dict, List, Union

from PIL import Image
from transformers import AutoImageProcessor, AutoTokenizer
from transformers.processing_utils import BatchFeature, ProcessorMixin

PLACEHOLDER = "<|media_placeholder|>"


class OpenCUAProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer", "image_token_id", "merge_size"]

    def __init__(self, image_processor, tokenizer, image_token_id: int = 151664, merge_size: int = 2, **kwargs):
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.image_token_id = image_token_id
        # Prefer the merge size declared by the image processor; fall back to the argument.
        self.merge_size = getattr(image_processor, "merge_size", merge_size)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        trust = kwargs.get("trust_remote_code", True)
        # Prefer the repo's own TikTokenV3; if that import fails, fall back to
        # AutoTokenizer (only used for initialization / as a placeholder).
        try:
            from tokenization_opencua import TikTokenV3
            tok = TikTokenV3.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        except Exception:
            tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        imgproc = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        return cls(imgproc, tok, **kwargs)

    def apply_chat_template(self, messages: List[Dict[str, Any]], **kwargs) -> Union[str, List[int]]:
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    # The methods below serve the HF path; vLLM initialization only needs the class
    # to be constructible.
    def __call__(self, *args, **kwargs) -> BatchFeature:
        # Return a minimal structure so an accidental call does not crash.
        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
        return BatchFeature(data=data)

    # Optional helper for your own scripts.
    def prepare_vllm_inputs(self, messages, images, add_generation_prompt=True):
        text = self.apply_chat_template(messages, tokenize=False, add_generation_prompt=add_generation_prompt)
        proc = self.image_processor(images=images, return_tensors="pt")
        grid = torch.as_tensor(proc["image_grid_thw"])
        merge = getattr(self, "merge_size", 2)
        # Expand each media placeholder to one token per merged vision patch. Go through a
        # temporary sentinel so that, with several images, already-expanded placeholders
        # are not matched again by the next replace.
        sentinel = "<|__media_expanded__|>"
        for thw in grid:
            num = int((thw[0] * thw[1] * thw[2]) // (merge ** 2))
            text = text.replace(PLACEHOLDER, sentinel * num, 1)
        text = text.replace(sentinel, PLACEHOLDER)
        return text, images


# # processing_opencua.py
# from transformers import Qwen2_5_VLProcessor, AutoTokenizer, AutoImageProcessor

# class OpenCUAProcessor(Qwen2_5_VLProcessor):
#     # A string works here, but we load the tokenizer manually in from_pretrained
#     # to avoid string-based reflection.
#     tokenizer_class = "TikTokenV3"

#     @classmethod
#     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
#         # Make sure remote code is allowed.
#         trust_remote_code = kwargs.get("trust_remote_code", False)

#         # 1) Load the tokenizer manually (resolved via tokenizer_config.json in the
#         #    model directory -> TikTokenV3 + tokenization_opencua.py).
#         tokenizer = AutoTokenizer.from_pretrained(
#             pretrained_model_name_or_path,
#             trust_remote_code=trust_remote_code,
#         )

#         # 2) Load the image processor manually (keeps Qwen2VLImageProcessor).
#         image_processor = AutoImageProcessor.from_pretrained(
#             pretrained_model_name_or_path,
#             trust_remote_code=trust_remote_code,
#         )

#         # 3) Grab the chat_template if the tokenizer has one.
#         chat_template = getattr(tokenizer, "chat_template", None)

#         # 4) Build and return a Qwen2.5-VL Processor instance, passing the chat_template.
#         processor = cls(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)

#         # 5) Add the attributes vLLM needs.
#         #    These token IDs must match the definitions in tokenizer_config.json.
#         processor.image_token = "<|media_placeholder|>"  # OpenCUA's media placeholder
#         processor.video_token = "<|media_placeholder|>"  # videos use the same placeholder

#         # Token IDs (taken from tokenizer_config.json).
#         vocab = tokenizer.get_vocab()
#         processor.image_token_id = vocab.get("<|media_placeholder|>", 151664)  # defaults to 151664
#         processor.video_token_id = vocab.get("<|media_placeholder|>", 151664)  # videos use the same ID
#         return processor
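

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal example of how the helper above
# might be wired up before calling vLLM. The model path, the message layout,
# and the assumption that the chat template passes the placeholder string
# through verbatim are hypothetical, not guaranteed by this module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from PIL import Image

    model_path = "path/to/OpenCUA-checkpoint"  # hypothetical local path
    processor = OpenCUAProcessor.from_pretrained(model_path, trust_remote_code=True)

    screenshot = Image.new("RGB", (1280, 720))  # stand-in for a real screenshot
    messages = [
        {"role": "user", "content": f"{PLACEHOLDER}\nDescribe the screen."},
    ]

    # Expands the placeholder to one token per merged vision patch; the returned
    # prompt and images can then be handed to a vLLM generate call.
    prompt, imgs = processor.prepare_vllm_inputs(messages, [screenshot])
    print(prompt[:200])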