even more device handling
custom_st.py (+121, -5) CHANGED
@@ -4,6 +4,7 @@ import os
 import math
 from io import BytesIO
 from typing import Any, Dict, List, Literal, Optional, Union
+from urllib.parse import urlparse

 import requests
 import torch
@@ -121,27 +122,142 @@ class Transformer(nn.Module):
         image_data = base64.b64decode(data)
         return Image.open(BytesIO(image_data))

-    def _process_input(self, texts: List[Union[str, Image.Image]]) -> tuple[List[str], List[Image.Image]]:
+    @staticmethod
+    def _is_valid_url(url: str) -> bool:
+        try:
+            result = urlparse(url)
+            # Check if scheme and netloc are present and scheme is http/https
+            return all([result.scheme in ('http', 'https'), result.netloc])
+        except Exception:
+            return False
+
+    @staticmethod
+    def _is_safe_path(path: str) -> bool:
+        try:
+            # Convert to absolute path and normalize
+            abs_path = os.path.abspath(os.path.normpath(path))
+            # Check if file exists and is a regular file (not a directory or special file)
+            return os.path.isfile(abs_path)
+        except Exception:
+            return False
+
+    @staticmethod
+    def _load_image_from_url(url: str) -> Image.Image:
+        try:
+            response = requests.get(
+                url,
+                stream=True,
+                timeout=10,  # Add timeout
+                headers={'User-Agent': 'Mozilla/5.0'}  # Add user agent
+            )
+            response.raise_for_status()
+
+            # Check content type
+            content_type = response.headers.get('content-type', '')
+            if not content_type.startswith('image/'):
+                raise ValueError(f"Invalid content type: {content_type}")
+
+            # Limit file size (e.g., 10MB)
+            content = BytesIO()
+            size = 0
+            max_size = 10 * 1024 * 1024  # 10MB
+
+            for chunk in response.iter_content(chunk_size=8192):
+                size += len(chunk)
+                if size > max_size:
+                    raise ValueError("File too large")
+                content.write(chunk)
+
+            content.seek(0)
+            return Image.open(content)
+        except Exception as e:
+            raise ValueError(f"Failed to load image from URL: {str(e)}")
+
+    @staticmethod
+    def _load_image_from_path(image_path: str) -> Image.Image:
+        try:
+            # Convert to absolute path and normalize
+            abs_path = os.path.abspath(os.path.normpath(image_path))
+
+            # Check file size before loading
+            file_size = os.path.getsize(abs_path)
+            max_size = 10 * 1024 * 1024  # 10MB
+            if file_size > max_size:
+                raise ValueError("File too large")
+
+            with Image.open(abs_path) as img:
+                # Make a copy to ensure file handle is closed
+                return img.copy()
+        except Exception as e:
+            raise ValueError(f"Failed to load image from path: {str(e)}")
+
+    @staticmethod
+    def _load_image_from_bytes(image_bytes: bytes) -> Image.Image:
+        try:
+            # Check size
+            if len(image_bytes) > 10 * 1024 * 1024:  # 10MB
+                raise ValueError("Image data too large")
+
+            return Image.open(BytesIO(image_bytes))
+        except Exception as e:
+            raise ValueError(f"Failed to load image from bytes: {str(e)}")
+
+    def _process_input(self, texts: List[Union[str, Image.Image, bytes]]) -> tuple[List[str], List[Image.Image]]:
         processed_texts = []
         processed_images = []
         dummy_image = Image.new('RGB', (56, 56))

         for sample in texts:
             if isinstance(sample, str):
-                processed_texts.append(self.query_prompt % sample)
-                processed_images.append(dummy_image)
+                # Check if the string is a valid URL
+                if self._is_valid_url(sample):
+                    try:
+                        img = self._load_image_from_url(sample)
+                        processed_texts.append(self.document_prompt)
+                        processed_images.append(self._resize_image(img))
+                    except Exception as e:
+                        # If URL loading fails, treat as regular text
+                        processed_texts.append(self.query_prompt % sample)
+                        processed_images.append(dummy_image)
+                # Check if the string is a valid file path
+                elif self._is_safe_path(sample):
+                    try:
+                        img = self._load_image_from_path(sample)
+                        processed_texts.append(self.document_prompt)
+                        processed_images.append(self._resize_image(img))
+                    except Exception as e:
+                        # If image loading fails, treat as regular text
+                        processed_texts.append(self.query_prompt % sample)
+                        processed_images.append(dummy_image)
+                else:
+                    # Regular text query
+                    processed_texts.append(self.query_prompt % sample)
+                    processed_images.append(dummy_image)
             elif isinstance(sample, Image.Image):
                 processed_texts.append(self.document_prompt)
                 processed_images.append(self._resize_image(sample))
+            elif isinstance(sample, bytes):
+                try:
+                    img = self._load_image_from_bytes(sample)
+                    processed_texts.append(self.document_prompt)
+                    processed_images.append(self._resize_image(img))
+                except Exception as e:
+                    # If bytes can't be converted to image, use dummy
+                    processed_texts.append(self.document_prompt)
+                    processed_images.append(dummy_image)

         return processed_texts, processed_images

     def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        cache_position = torch.arange(0, features['input_ids'].shape[
+        cache_position = torch.arange(0, features['input_ids'].shape[1])
         inputs = self.model.prepare_inputs_for_generation(
             **features, cache_position=cache_position, use_cache=False
         )

+        # ensure inputs are on the same device as the model
+        device = next(self.model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
+
         with torch.no_grad():
             output = self.model(
                 **inputs,
@@ -155,7 +271,7 @@ class Transformer(nn.Module):
         )
         return features

-    def tokenize(self, texts: List[Union[str, Image.Image]], padding: str = 'longest') -> Dict[str, torch.Tensor]:
+    def tokenize(self, texts: List[Union[str, Image.Image, bytes]], padding: str = 'longest') -> Dict[str, torch.Tensor]:
         processed_texts, processed_images = self._process_input(texts)

         return self.processor(
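With this change, tokenize (and therefore SentenceTransformer.encode) accepts plain text, image URLs, local file paths, PIL images, and raw image bytes in a single batch. A minimal usage sketch, assuming this custom_st.py ships with a checkpoint loaded via trust_remote_code=True; the model id and file names below are placeholders, not part of this commit:

    # Hypothetical usage of the new multi-input path in this commit.
    from PIL import Image
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("org/model-with-custom-st", trust_remote_code=True)

    inputs = [
        "what does the chart on page 3 show?",  # plain text -> query_prompt
        "https://example.com/figure.png",       # URL -> fetched, document_prompt
        "/data/scans/page3.png",                # local path -> loaded, document_prompt
        Image.open("page4.png"),                # PIL image -> document_prompt
        open("page5.png", "rb").read(),         # raw bytes -> decoded, document_prompt
    ]
    embeddings = model.encode(inputs)

Strings that look like URLs or readable paths but fail to decode as images fall back to being embedded as text queries, so a dead link degrades to text rather than raising.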
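The forward change is the device fix the commit title refers to: prepare_inputs_for_generation can hand back tensors (such as the freshly built cache_position, created on CPU) that do not live on the same device as the model weights. The sketch below restates the pattern standalone; move_to_model_device is an illustrative helper, not part of the commit:

    import torch
    import torch.nn as nn

    def move_to_model_device(model: nn.Module, inputs: dict) -> dict:
        # Same pattern as the diff: take the device of the first parameter
        # and move every tensor input there; non-tensor entries are dropped,
        # as in the committed dict comprehension.
        device = next(model.parameters()).device
        return {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}

    model = nn.Linear(4, 2).to("cuda" if torch.cuda.is_available() else "cpu")
    batch = {"input_ids": torch.zeros(1, 4), "labels": None}
    batch = move_to_model_device(model, batch)  # tensors now on the model's device
    out = model(batch["input_ids"])

Note that the comprehension also filters out any non-tensor entries returned by prepare_inputs_for_generation, so flags such as use_cache are dropped rather than forwarded to the model call.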