Update src/comparison.py

src/comparison.py  (+177 -104)  CHANGED

@@ -1,55 +1,95 @@
 import numpy as np
-from typing import List, Dict, Optional
+from typing import List, Dict, Tuple, Optional
 from scipy.optimize import linear_sum_assignment
 import yaml
 import cv2
 import hashlib
 import torch
+import torch.nn as nn
 from PIL import Image
-from transformers import SiglipModel, SiglipImageProcessor
 
+# DINOv2 availability check
+DINOV2_AVAILABLE = False
 try:
-            model_name = 'google/siglip-base-patch16-224'
-            _GLOBAL_SIGLIP_PROCESSOR = SiglipImageProcessor.from_pretrained(model_name)
-            _GLOBAL_SIGLIP_MODEL = SiglipModel.from_pretrained(model_name).to(device)
-            _GLOBAL_SIGLIP_MODEL.eval()
-                param.requires_grad = False
+    # Try loading DINOv2 from torch.hub first
+    torch.hub._validate_not_a_forked_repo = lambda a, b, c: True  # Allow loading from hub
+    DINOV2_AVAILABLE = True
+    print("✓ DINOv2 will be loaded when needed")
+except Exception as e:
+    print(f"⚠ DINOv2 preparation failed: {e}")
+
+    # Fallback: Try transformers library
+    try:
+        from transformers import AutoImageProcessor, AutoModel
+
+        DINOV2_AVAILABLE = True
+        print("✓ DINOv2 available via transformers library")
+    except ImportError:
+        print("DINOv2 not available. Using traditional features only.")
+        print("  Install with: pip install transformers torch")
+
+# Global DINOv2 model and processor
+_GLOBAL_DINOV2_MODEL = None
+_GLOBAL_DINOV2_PROCESSOR = None
+_GLOBAL_DINOV2_TRANSFORM = None
+
+
+def get_dinov2_model():
+    """Get or initialize global DINOv2 model"""
+    global _GLOBAL_DINOV2_MODEL, _GLOBAL_DINOV2_PROCESSOR, _GLOBAL_DINOV2_TRANSFORM
+
+    if _GLOBAL_DINOV2_MODEL is None and DINOV2_AVAILABLE:
+        try:
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+            # Try torch.hub first (preferred method)
+            try:
+                # Load DINOv2 small model (you can change to vitb14, vitl14, or vitg14 for larger models)
+                # vits14 = small (85M params), vitb14 = base (307M), vitl14 = large (1B), vitg14 = giant (1.8B)
+                model_name = 'dinov2_vits14'  # Using small model for speed
+                _GLOBAL_DINOV2_MODEL = torch.hub.load('facebookresearch/dinov2', model_name)
+                _GLOBAL_DINOV2_MODEL.to(device)
+                _GLOBAL_DINOV2_MODEL.eval()
+
+                # Disable gradient computation for inference
+                for param in _GLOBAL_DINOV2_MODEL.parameters():
+                    param.requires_grad = False
+
+                # Create transform for DINOv2
+                import torchvision.transforms as T
+                _GLOBAL_DINOV2_TRANSFORM = T.Compose([
+                    T.Resize(256, interpolation=T.InterpolationMode.BICUBIC),
+                    T.CenterCrop(224),
+                    T.ToTensor(),
+                    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+                ])
+
+                print(f"✓ DINOv2 model loaded: {model_name} via torch.hub")
+
+            except Exception as hub_error:
+                print(f"Torch.hub failed, trying transformers: {hub_error}")
+
+                # Fallback to transformers library
+                from transformers import AutoImageProcessor, AutoModel
+
+                model_name = "facebook/dinov2-small"  # Or dinov2-base, dinov2-large, dinov2-giant
+                _GLOBAL_DINOV2_PROCESSOR = AutoImageProcessor.from_pretrained(model_name)
+                _GLOBAL_DINOV2_MODEL = AutoModel.from_pretrained(model_name).to(device)
+                _GLOBAL_DINOV2_MODEL.eval()
+
+                for param in _GLOBAL_DINOV2_MODEL.parameters():
+                    param.requires_grad = False
+
+                print(f"✓ DINOv2 model loaded: {model_name} via transformers")
+
         except Exception as e:
-    return _GLOBAL_SIGLIP_MODEL, _GLOBAL_SIGLIP_PROCESSOR
+            print(f"⚠ DINOv2 loading failed: {e}. Using fallback features.")
+            _GLOBAL_DINOV2_MODEL = None
+            _GLOBAL_DINOV2_PROCESSOR = None
+            _GLOBAL_DINOV2_TRANSFORM = None
+
+    return _GLOBAL_DINOV2_MODEL, _GLOBAL_DINOV2_PROCESSOR, _GLOBAL_DINOV2_TRANSFORM
 
 
 class DamageComparator:
     """Enhanced damage comparator with DINOv2-based view-invariant re-identification"""
@@ -70,14 +110,14 @@ class DamageComparator:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
         # Get global DINOv2 model
+        self.dinov2_model, self.dinov2_processor, self.dinov2_transform = get_dinov2_model()
 
+        # ReID thresholds (DINOv2 typically needs different thresholds than CLIP)
+        self.reid_similarity_threshold = 0.7  # Slightly higher for DINOv2
         self.feature_cache = {}
 
+        # DINOv2 feature dimension (depends on model size)
+        self.dinov2_feature_dim = 384 if 'vits' in str(self.dinov2_model.__class__) else 768
 
     def calculate_iou(self, box1: List[int], box2: List[int]) -> float:
         """Calculate Intersection over Union between two boxes"""
@@ -99,48 +139,9 @@ class DamageComparator:
 
         return intersection / union
 
-    def _extract_siglip_features(self, roi: np.ndarray) -> np.ndarray:
-        """Extract SigLIP 2 vision features - faster & more accurate than DINOv2 for ReID"""
-        try:
-            model, processor = self.siglip_model, self.siglip_processor
-
-            # Convert BGR to RGB and resize (SigLIP uses 224x224)
-            roi_rgb = cv2.cvtColor(cv2.resize(roi, (224, 224)), cv2.COLOR_BGR2RGB)
-            roi_pil = Image.fromarray(roi_rgb)
-
-            # CRITICAL: no_grad + autocast for speed/memory
-            with torch.no_grad(), torch.autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
-                # Preprocess
-                inputs = processor(roi_pil, return_tensors="pt")
-                inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-                # Forward pass: Use vision_model to get CLS token (fix for get_image_features error)
-                # SigLIPModel.vision_model returns last_hidden_state [1, seq_len, 768]
-                outputs = model.vision_model(**inputs)
-                features = outputs.last_hidden_state[:, 0, :]  # CLS token as global feature
-
-                # Move to CPU immediately to free VRAM
-                features = features.cpu().numpy().flatten()
-
-                # Clear CUDA cache
-                if self.device.type == 'cuda':
-                    torch.cuda.empty_cache()
-
-            # Optional: Combine with edge density (for texture robustness)
-            gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
-            edges = cv2.Canny(gray, 50, 150)
-            edge_density = np.sum(edges > 0) / edges.size
-            features = np.concatenate([features[:120], [edge_density * 100] * 8])  # Match DINO dim
-
-            return features[:128]  # Trim for consistency
-
-        except Exception as e:
-            print(f"SigLIP 2 feature extraction error: {e}")
-            return np.zeros(128)
-
     def extract_damage_features(self, image: np.ndarray, bbox: List[int]) -> np.ndarray:
         """
+        Extract view-invariant features for damage ReID using DINOv2
         Args:
             image: Full image (BGR format from OpenCV)
             bbox: [x1, y1, x2, y2] bounding box
@@ -160,10 +161,10 @@ class DamageComparator:
 
         features_list = []
 
+        # 1. DINOv2 features (if available) - Most powerful for ReID
+        if self.dinov2_model is not None:
+            dinov2_features = self._extract_dinov2_features(damage_roi)
+            features_list.append(dinov2_features)
 
         # 2. Geometric invariant features (always available)
         geometric_features = self._extract_geometric_features(damage_roi)
@@ -187,6 +188,71 @@ class DamageComparator:
 
         return combined_features
 
+    def _extract_dinov2_features(self, roi: np.ndarray) -> np.ndarray:
+        """Extract DINOv2 vision features - superior to CLIP for visual tasks"""
+        try:
+            # Convert BGR to RGB
+            roi_rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)
+
+            # Convert to PIL Image
+            roi_pil = Image.fromarray(roi_rgb)
+
+            # CRITICAL: Always use no_grad context to save memory
+            with torch.no_grad():
+                if self.dinov2_transform is not None:
+                    # Using torch.hub version
+                    # Apply transforms
+                    roi_tensor = self.dinov2_transform(roi_pil).unsqueeze(0).to(self.device)
+
+                    # Extract features
+                    features = self.dinov2_model(roi_tensor)
+
+                    # DINOv2 returns [batch_size, num_patches+1, feature_dim]
+                    # We use the [CLS] token (first token) as the global feature
+                    if len(features.shape) == 3:
+                        features = features[:, 0, :]  # Get CLS token
+
+                    # Move to CPU immediately to free VRAM
+                    features = features.cpu().numpy().flatten()
+
+                    # Clear CUDA cache if using GPU
+                    if self.device.type == 'cuda':
+                        torch.cuda.empty_cache()
+
+                elif self.dinov2_processor is not None:
+                    # Using transformers version
+                    inputs = self.dinov2_processor(images=roi_pil, return_tensors="pt")
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                    outputs = self.dinov2_model(**inputs)
+                    # Use pooler_output if available, otherwise use last_hidden_state
+                    if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
+                        features = outputs.pooler_output
+                    else:
+                        # Use CLS token from last hidden state
+                        features = outputs.last_hidden_state[:, 0, :]
+
+                    # Move to CPU immediately to free VRAM
+                    features = features.cpu().numpy().flatten()
+
+                    # Clear CUDA cache if using GPU
+                    if self.device.type == 'cuda':
+                        torch.cuda.empty_cache()
+                else:
+                    return np.zeros(128)
+
+            gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
+            edges = cv2.Canny(gray, 50, 150)
+            edge_density = np.sum(edges > 0) / edges.size
+
+            # Combine with DINOv2 features
+            features = np.concatenate([features[:120], [edge_density * 100] * 8])
+
+            return features[:128]  # Take first 128 dimensions for consistency
+
+        except Exception as e:
+            print(f"DINOv2 feature extraction error: {e}")
+            return np.zeros(128)
 
     def _extract_geometric_features(self, roi: np.ndarray) -> np.ndarray:
         """Extract geometric invariant features (Hu moments)"""
@@ -277,24 +343,24 @@ class DamageComparator:
         ]
 
         return np.array(features)
 
     def compute_match_score(self, box1, box2, reid_sim,
+                        alpha=0.6, beta=0.3, gamma=0.1):
         """
         Weighted score combining ReID, IoU, and position
        alpha, beta, gamma = weights
         """
         # IoU
         iou = self.calculate_iou(box1, box2)
 
         # Calculate the distance between box centers
         cx1, cy1 = (box1[0] + box1[2]) / 2, (box1[1] + box1[3]) / 2
         cx2, cy2 = (box2[0] + box2[2]) / 2, (box2[1] + box2[3]) / 2
         dist = ((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2) ** 0.5
 
         # Convert distance → score (0 → 1)
         pos_score = max(0, 1 - dist / self.position_tolerance)
 
         # Weighted score
         return alpha * reid_sim + beta * iou + gamma * pos_score
 
@@ -314,8 +380,10 @@ class DamageComparator:
 
         boxes1 = detections1['boxes']
         boxes2 = detections2['boxes']
+        print(f"\n🔍 DEBUG match_damages_with_reid (DINOv2):")
         print(f"   Boxes1: {len(boxes1)}, Boxes2: {len(boxes2)}")
         print(f"   Images provided: {image1 is not None and image2 is not None}")
+        print(f"   DINOv2 available: {self.dinov2_model is not None}")
 
         if len(boxes1) == 0 and len(boxes2) == 0:
             return {
@@ -356,7 +424,7 @@ class DamageComparator:
         if image1 is not None and image2 is not None:
             reid_matrix = np.zeros((len(boxes1), len(boxes2)))
 
+            print("   Extracting DINOv2 features for damage matching...")
 
             # Extract features for all boxes
             features1 = [self.extract_damage_features(image1, box) for box in boxes1]
@@ -367,6 +435,7 @@ class DamageComparator:
                 for j, feat2 in enumerate(features2):
                     reid_matrix[i, j] = np.dot(feat1, feat2)  # Already normalized
 
+            print(f"   DINOv2 features extracted successfully")
 
         if reid_matrix is not None:
             combined_matrix = np.zeros_like(reid_matrix)
@@ -379,6 +448,7 @@ class DamageComparator:
         else:
             combined_matrix = iou_matrix
 
+
         # Hungarian algorithm for optimal matching
         cost_matrix = 1 - combined_matrix
         row_indices, col_indices = linear_sum_assignment(cost_matrix)
@@ -393,25 +463,27 @@ class DamageComparator:
 
         # Around line ~560 in comparison.py
         # In match_damages_with_reid, around line ~560
+        # In match_damages_with_reid, lines 556-571
         for i, j in zip(row_indices, col_indices):
             score = combined_matrix[i, j]
             iou_score = iou_matrix[i, j]
 
+            # New logic - consistent with the actual config
+            if iou_score >= self.iou_threshold:  # >= 0.35 from config
+                # Good IoU - use a low threshold
+                threshold_to_use = self.combined_score_threshold  # 0.3
+            elif iou_score > 0.1:
+                # Medium IoU - moderate threshold
+                threshold_to_use = 0.45
+            elif iou_score > 0.05:
+                # Low IoU - requires a higher score
+                threshold_to_use = 0.55
             else:
+                # Very low IoU - requires very strong ReID
+                threshold_to_use = 0.65
 
             print(f"   Pair ({i},{j}): IoU={iou_score:.3f}, Score={score:.3f}, Threshold={threshold_to_use:.3f}")
-            print(
-                f"   Will match: {score >= threshold_to_use and detections1['classes'][i] == detections2['classes'][j]}")
 
             if score >= threshold_to_use:
                 if detections1['classes'][i] == detections2['classes'][j]:
                     matched_pairs.append((i, j, score))
@@ -499,6 +571,7 @@ class DamageComparator:
                 'repaired': len(existing_damages),
                 'new': len(new_damages),
                 'using_reid': bool(before_image is not None and after_image is not None),
+                'reid_model': 'DINOv2' if self.dinov2_model is not None else 'Traditional'
             }
         }
 
@@ -542,7 +615,7 @@ class DamageComparator:
                                     detections_list: List[Dict],
                                     images_list: List[np.ndarray]) -> Dict:
         """
+        Deduplicate damages across multiple views using DINOv2 features
         Args:
             detections_list: List of detections from different views
             images_list: List of corresponding images
@@ -551,7 +624,7 @@ class DamageComparator:
         """
         all_damages = []
 
-        print(f"Deduplicating damages across {len(images_list)} ...")
+        print(f"Deduplicating damages across {len(images_list)} views using DINOv2...")
 
         # Collect all damages with their features
        for view_idx, (detections, image) in enumerate(zip(detections_list, images_list)):
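For reference, a minimal usage sketch of the matching path this commit introduces. It assumes src/comparison.py is importable as src.comparison and that DamageComparator can be constructed without arguments; the image paths, box coordinates, and similarity handling below are illustrative placeholders, not values from the repository.

import cv2
import numpy as np

from src.comparison import DamageComparator  # assumed import path for src/comparison.py

# Hypothetical inputs: two inspection photos in BGR, as loaded by OpenCV
before = cv2.imread("before.jpg")   # placeholder path
after = cv2.imread("after.jpg")     # placeholder path

comparator = DamageComparator()     # assumes a no-argument constructor

# Per-box ReID features: DINOv2 CLS token combined with edge density, trimmed to 128-D
box_before = [100, 120, 220, 260]   # [x1, y1, x2, y2], illustrative values
box_after = [110, 125, 230, 270]
feat_before = comparator.extract_damage_features(before, box_before)
feat_after = comparator.extract_damage_features(after, box_after)

# The diff treats the combined features as normalized, so a dot product serves as cosine similarity
reid_sim = float(np.dot(feat_before, feat_after))

# Weighted match score: alpha * ReID similarity + beta * IoU + gamma * position score
score = comparator.compute_match_score(box_before, box_after, reid_sim)
print(f"ReID similarity: {reid_sim:.3f}, combined match score: {score:.3f}")

In the matching loop shown above, this score is then compared against an IoU-dependent threshold (the config-driven combined_score_threshold for well-overlapping boxes, or 0.45 / 0.55 / 0.65 as the overlap drops), and a pair is only accepted when the detected classes also agree.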