Image-Text-to-Text · sentence-transformers · Safetensors · Transformers · qwen2_vl · Qwen2-VL · conversational
cheesyFishes committed on
Commit 6a23f44 · verified · 1 Parent(s): fdba9e3

improve again

Files changed (1): custom_st.py (+60, -70)
custom_st.py CHANGED
@@ -9,7 +9,7 @@ import requests
 import torch
 from PIL import Image
 from torch import nn
-from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, AutoConfig
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
 class Transformer(nn.Module):
     save_in_root: bool = True
@@ -21,11 +21,9 @@ class Transformer(nn.Module):
         max_pixels: int = 768 * 28 * 28,
         min_pixels: int = 1 * 28 * 28,
         dimension: int = 2048,
+        max_seq_length: Optional[int] = None,
         cache_dir: Optional[str] = None,
         device: str = 'cuda:0',
-        config_args: Optional[Dict[str, Any]] = None,
-        model_args: Optional[Dict[str, Any]] = None,
-        processor_args: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> None:
         super(Transformer, self).__init__()
@@ -34,61 +32,55 @@ class Transformer(nn.Module):
         self.dimension = dimension
         self.max_pixels = max_pixels
         self.min_pixels = min_pixels
-        self.model_name_or_path = model_name_or_path
-        self.processor_name_or_path = processor_name_or_path or model_name_or_path
-        self.cache_dir = cache_dir
+        self.max_seq_length = max_seq_length
 
-        self.config_args = config_args or {}
-        self.model_args = model_args or {}
-        self.processor_args = processor_args or {}
-
-        self.document_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>"
-        self.query_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
-
-    @classmethod
-    def load(cls, input_path: str) -> 'Transformer':
-        config_path = os.path.join(input_path, 'config.json')
-        if os.path.exists(config_path):
-            with open(config_path) as f:
-                config = json.load(f)
-        else:
-            config = {}
-
-        instance = cls(model_name_or_path=input_path, **config)
-
-        # Load model with flash attention if available
+        # Initialize model
         try:
-            instance.model = Qwen2VLForConditionalGeneration.from_pretrained(
-                input_path,
+            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+                model_name_or_path,
                 attn_implementation="flash_attention_2",
                 torch_dtype=torch.bfloat16,
-                device_map=instance.device,
-                cache_dir=instance.cache_dir,
-                **instance.model_args
+                device_map=device,
+                cache_dir=cache_dir,
+                **kwargs
             ).eval()
         except (ImportError, ValueError) as e:
             print(f"Flash attention not available, falling back to default attention: {e}")
-            instance.model = Qwen2VLForConditionalGeneration.from_pretrained(
-                input_path,
+            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+                model_name_or_path,
                 torch_dtype=torch.bfloat16,
-                device_map=instance.device,
-                cache_dir=instance.cache_dir,
-                **instance.model_args
+                device_map=device,
+                cache_dir=cache_dir,
+                **kwargs
            ).eval()
 
         # Initialize processor
-        instance.processor = AutoProcessor.from_pretrained(
-            input_path,
-            min_pixels=instance.min_pixels,
-            max_pixels=instance.max_pixels,
-            cache_dir=instance.cache_dir,
-            **instance.processor_args
+        self.processor = AutoProcessor.from_pretrained(
+            processor_name_or_path or model_name_or_path,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            cache_dir=cache_dir
         )
 
-        instance.model.padding_side = "left"
-        instance.processor.tokenizer.padding_side = "left"
-
-        return instance
+        # Set padding sides
+        self.model.padding_side = "left"
+        self.processor.tokenizer.padding_side = "left"
+
+        # Store prompts
+        self.document_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>"
+        self.query_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
+
+        # Try to infer max_seq_length if not provided
+        if self.max_seq_length is None:
+            if (
+                hasattr(self.model, 'config')
+                and hasattr(self.model.config, 'max_position_embeddings')
+                and hasattr(self.processor.tokenizer, 'model_max_length')
+            ):
+                self.max_seq_length = min(
+                    self.model.config.max_position_embeddings,
+                    self.processor.tokenizer.model_max_length,
+                )
 
     def _smart_resize(self, height: int, width: int) -> tuple[int, int]:
         h_bar = max(28, self._round_by_factor(height, 28))
@@ -132,21 +124,8 @@ class Transformer(nn.Module):
 
         for sample in texts:
             if isinstance(sample, str):
-                if sample.startswith('http') or sample.startswith('data:image/'):
-                    try:
-                        if sample.startswith('http'):
-                            response = requests.get(sample)
-                            image = Image.open(BytesIO(response.content)).convert('RGB')
-                        else:
-                            image = self._decode_data_image(sample).convert('RGB')
-                        processed_texts.append(self.document_prompt)
-                        processed_images.append(self._resize_image(image))
-                    except Exception as e:
-                        processed_texts.append(self.query_prompt % sample)
-                        processed_images.append(dummy_image)
-                else:
-                    processed_texts.append(self.query_prompt % sample)
-                    processed_images.append(dummy_image)
+                processed_texts.append(self.query_prompt % sample)
+                processed_images.append(dummy_image)
             elif isinstance(sample, Image.Image):
                 processed_texts.append(self.document_prompt)
                 processed_images.append(self._resize_image(sample))
@@ -186,21 +165,32 @@ class Transformer(nn.Module):
         return {k: v.to(self.device) for k, v in inputs.items()}
 
     def save(self, output_path: str, safe_serialization: bool = True) -> None:
+        """Save the model, tokenizer and processor to the given path."""
+        self.model.save_pretrained(output_path, safe_serialization=safe_serialization)
+        self.processor.save_pretrained(output_path)
+
         # Save the configuration
         config = {
-            'model_name_or_path': self.model_name_or_path,
-            'processor_name_or_path': self.processor_name_or_path,
+            'model_name_or_path': output_path,
             'max_pixels': self.max_pixels,
             'min_pixels': self.min_pixels,
             'dimension': self.dimension,
-            'config_args': self.config_args,
-            'model_args': self.model_args,
-            'processor_args': self.processor_args,
+            'max_seq_length': self.max_seq_length,
         }
 
-        os.makedirs(output_path, exist_ok=True)
-        with open(os.path.join(output_path, 'config.json'), 'w') as f:
+        config_path = os.path.join(output_path, 'sentence_bert_config.json')
+        with open(config_path, 'w') as f:
            json.dump(config, f)
 
-        self.model.save_pretrained(output_path, safe_serialization=safe_serialization)
-        self.processor.save_pretrained(output_path)
+    @staticmethod
+    def load(input_path: str) -> 'Transformer':
+        """Load a saved model from the given path."""
+        # Load configuration
+        config_path = os.path.join(input_path, 'sentence_bert_config.json')
+        if os.path.exists(config_path):
+            with open(config_path) as f:
+                config = json.load(f)
+        else:
+            config = {'model_name_or_path': input_path}
+
+        return Transformer(**config)
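
For orientation, a minimal usage sketch of the module this diff modifies. It assumes the repository's modules.json registers custom_st.Transformer as the first sentence-transformers module (so it is picked up via trust_remote_code), and "org/model-id" and "page.png" are placeholders, not names from this repo.

# Minimal usage sketch (not part of this commit); assumes modules.json wires
# custom_st.Transformer into the sentence-transformers pipeline and that
# "org/model-id" stands in for the actual repository id.
from PIL import Image
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("org/model-id", trust_remote_code=True)

# Plain strings are wrapped in query_prompt by the module's tokenize()
query_embeddings = model.encode(["what does this chart show?"])

# PIL images are wrapped in document_prompt and resized before encoding
doc_embeddings = model.encode([Image.open("page.png")])

print(query_embeddings.shape, doc_embeddings.shape)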