Spaces:

weihongliang
/

Personalized-VQA

Sleeping

App Files Community

weihongliang committed on Aug 19

Commit

5becd44

verified ·

1 Parent(s): d63e46a

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -30

app.py CHANGED Viewed

@@ -20,10 +20,13 @@ import torch.backends.cudnn as cudnn
 from torchvision import transforms as pth_transforms
 import shutil
 import os
-os.system("wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth")
-os.system("wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth")
 sys.path.append("./segment-anything")
@@ -40,21 +43,29 @@ OBJECT_SAVE_PATH = "./database/Objects/masks"
 FACE_SAVE_PATH = "./database/Faces/masks"
 # Initialize SAM model
 def initialize_sam(sam_checkpoint, model_type="vit_h"):
-    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
-    sam.to(device="cuda" if torch.cuda.is_available() else "cpu")
     return sam
-# Path to the SAM checkpoint
-sam_checkpoint = "./sam_vit_h_4b8939.pth"
-sam = initialize_sam(sam_checkpoint)
-predictor = None
 # Load RADIO model
 model_version = "radio_v2.5-h"  # Using RADIOv2.5-H model (ViT-H/16)
-model = torch.hub.load('NVlabs/RADIO', 'radio_model', version=model_version, progress=True, skip_validation=True)
-model.cuda().eval()
 def extract_features(image_path):
     """Extract features from an image using the RADIO model."""
@@ -140,11 +151,14 @@ def _robust_collate_fn_for_extract_features(batch):
     return image_data_list, batched_indices
 def extract_features(object_dataset, batch_size, num_workers):
      """
      Extracts features from images, handling inputs as paths, PIL Images, or Tensors.
      Assumes `model`, `model_version`, `pil_to_tensor` are in calling scope.
      """
      dataloader = DataLoader(
          object_dataset,
          batch_size=batch_size,
@@ -473,12 +487,18 @@ def navigate_images(is_same_object=False):
     status_text = state.get_status_text()
     return current_image, mask_display, status_text, state.get_gallery(), None  # Return None to clear file upload
 def generate_mask(image, evt: gr.SelectData): # 'image' is the numpy array from the clicked component
-    global predictor
     # Use the image passed by the event!
     if image is None:
         return None, None, "Cannot segment: Image component is empty.", state.get_gallery()
     # Ensure the image is a NumPy array in RGB format (Gradio usually provides this)
     if not isinstance(image, np.ndarray):
@@ -935,7 +955,7 @@ imsize = 224
 args = args_parser.parse_args(args=[
     "--train_path", "./database/Objects/masks",
     "--test_path", "temp_path_placeholder",  # This will be updated during runtime
-    "--pretrained_weights", "./dinov2_vitl14_reg4_pretrain.pth",
     "--output_dir", f"exps/output_RankSelect_{imsize}_mask",  # Default tag, will be updated
 ])
@@ -943,8 +963,15 @@ args = args_parser.parse_args(args=[
 os.makedirs(args.output_dir, exist_ok=True)
 #model, autocast_dtype = setup_and_build_model(args)
 def detect_objects(input_img, score_threshold=0.52, tag="mask"):
     """Main function to detect objects in an image"""
     # Create temporary file for the input image
     with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as f:
         temp_path = f.name
@@ -1256,6 +1283,7 @@ def detect_objects(input_img, score_threshold=0.52, tag="mask"):
 # ===== FACE DETECTION AND RECOGNITION PART =====
 # Initialize face detection and recognition models
 def initialize_face_models():
     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
     mtcnn = MTCNN(
@@ -1351,6 +1379,7 @@ def get_face_embeddings(face_dir=FACE_SAVE_PATH):
     return embeddings, face_names, face_paths, face_anns
 # Detect and recognize faces in an image
 def detect_faces(input_img, score_threshold=0.7):
     mtcnn, resnet, device = initialize_face_models()
@@ -1504,6 +1533,7 @@ def match_faces_stable_matching(face_embeddings, detected_embeddings, score_thre
     return matches, similarities
 # 1. Add the combined detection function
 def combined_detection(img, obj_threshold, face_threshold, tag):
     """
     Run both object detection and face detection on the same image
@@ -1599,37 +1629,40 @@ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 # Load model and processor at the application level for reuse
 def load_qwen2vl_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
-        #"/mnt/14T-disk/code/Contextual_Referring_Understanding/LLaMA-Factory/models/qwen2_vl_7b_citation_lora_sft_face_3/goodcaption-20000",
-        "/mnt/14T-disk/code/Contextual_Referring_Understanding/LLaMA-Factory/models/qwen2_vl_2b_citation_lora_sft_face_3/goodcaption-20000",
         torch_dtype=torch.bfloat16,
-        device_map="cuda:0"
     )
     min_pixels = 256 * 28 * 28
     max_pixels = 1280 * 28 * 28
     processor = AutoProcessor.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
     )
-    #processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
     return model, processor
-# Try to load the model, but handle errors if it fails
-try:
-    qwen_model, qwen_processor = load_qwen2vl_model()
-    qwen_model_loaded = True
-except Exception as e:
-    print(f"Failed to load Qwen2-VL model: {e}")
-    qwen_model_loaded = False
 # Function to process detection results and use Qwen2-VL for answering questions
 def ask_qwen_about_detections(input_image, question, obj_threshold, face_threshold, tag):
     """
     Process an image with detection and use Qwen2-VL to answer questions
     """
-    # Check if the model is loaded
-    if not qwen_model_loaded:
-        return "Qwen2-VL model not loaded. Please check console for errors.", None, None
     # Get detection results and formatted text
     qwen_input, output_img = process_image_for_qwen(input_image, obj_threshold, face_threshold, tag)
@@ -2146,7 +2179,7 @@ with gr.Blocks() as app:
                 lines=2
             )
-            qwen_ask_button = gr.Button("Ask RC-MLLM-7B")
         with gr.Column():
             qwen_output_image = gr.Image(label="Detection Result")
@@ -2174,8 +2207,7 @@ with gr.Blocks() as app:
     # Model status display
     model_status = gr.Markdown(
-        "✅ RC-MLLM model loaded successfully" if qwen_model_loaded else
-        "❌ RC-MLLM model not loaded. Please check console for errors."
     )
     # Instructions for RC-MLLM section

 from torchvision import transforms as pth_transforms
 import shutil
 import os
+import spaces
+# Download SAM checkpoint if not exists
+import subprocess
+if not os.path.exists("./sam_vit_h_4b8939.pth"):
+    subprocess.run(["wget", "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"])
 sys.path.append("./segment-anything")
 FACE_SAVE_PATH = "./database/Faces/masks"
 # Initialize SAM model
+sam = None
+predictor = None
+@spaces.GPU
 def initialize_sam(sam_checkpoint, model_type="vit_h"):
+    global sam
+    if sam is None:
+        sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
+        sam.to(device="cuda" if torch.cuda.is_available() else "cpu")
     return sam
 # Load RADIO model
 model_version = "radio_v2.5-h"  # Using RADIOv2.5-H model (ViT-H/16)
+model = None  # Initialize as None, will be loaded when needed
+@spaces.GPU
+def load_radio_model():
+    global model
+    if model is None:
+        model = torch.hub.load('NVlabs/RADIO', 'radio_model', version=model_version, progress=True, skip_validation=True)
+        model.cuda().eval()
+    return model
 def extract_features(image_path):
     """Extract features from an image using the RADIO model."""
     return image_data_list, batched_indices
+@spaces.GPU
 def extract_features(object_dataset, batch_size, num_workers):
      """
      Extracts features from images, handling inputs as paths, PIL Images, or Tensors.
      Assumes `model`, `model_version`, `pil_to_tensor` are in calling scope.
      """
+     # Ensure model is loaded
+     model = load_radio_model()
      dataloader = DataLoader(
          object_dataset,
          batch_size=batch_size,
     status_text = state.get_status_text()
     return current_image, mask_display, status_text, state.get_gallery(), None  # Return None to clear file upload
+@spaces.GPU
 def generate_mask(image, evt: gr.SelectData): # 'image' is the numpy array from the clicked component
+    global predictor, sam
     # Use the image passed by the event!
     if image is None:
         return None, None, "Cannot segment: Image component is empty.", state.get_gallery()
+    # Initialize SAM if not already done
+    if sam is None:
+        sam_checkpoint = "./sam_vit_h_4b8939.pth"
+        sam = initialize_sam(sam_checkpoint)
     # Ensure the image is a NumPy array in RGB format (Gradio usually provides this)
     if not isinstance(image, np.ndarray):
 args = args_parser.parse_args(args=[
     "--train_path", "./database/Objects/masks",
     "--test_path", "temp_path_placeholder",  # This will be updated during runtime
+    "--pretrained_weights", "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_pretrain.pth",
     "--output_dir", f"exps/output_RankSelect_{imsize}_mask",  # Default tag, will be updated
 ])
 os.makedirs(args.output_dir, exist_ok=True)
 #model, autocast_dtype = setup_and_build_model(args)
+@spaces.GPU
 def detect_objects(input_img, score_threshold=0.52, tag="mask"):
     """Main function to detect objects in an image"""
+    global sam
+    # Initialize SAM if not already done
+    if sam is None:
+        sam_checkpoint = "./sam_vit_h_4b8939.pth"
+        sam = initialize_sam(sam_checkpoint)
     # Create temporary file for the input image
     with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as f:
         temp_path = f.name
 # ===== FACE DETECTION AND RECOGNITION PART =====
 # Initialize face detection and recognition models
+@spaces.GPU
 def initialize_face_models():
     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
     mtcnn = MTCNN(
     return embeddings, face_names, face_paths, face_anns
 # Detect and recognize faces in an image
+@spaces.GPU
 def detect_faces(input_img, score_threshold=0.7):
     mtcnn, resnet, device = initialize_face_models()
     return matches, similarities
 # 1. Add the combined detection function
+@spaces.GPU
 def combined_detection(img, obj_threshold, face_threshold, tag):
     """
     Run both object detection and face detection on the same image
 from qwen_vl_utils import process_vision_info
 # Load model and processor at the application level for reuse
+@spaces.GPU
 def load_qwen2vl_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2-VL-2B-Instruct",  # Use the base model for HF Spaces
         torch_dtype=torch.bfloat16,
+        device_map="auto"
     )
     min_pixels = 256 * 28 * 28
     max_pixels = 1280 * 28 * 28
     processor = AutoProcessor.from_pretrained(
         "Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
     )
     return model, processor
+# Initialize model variables
+qwen_model = None
+qwen_processor = None
+qwen_model_loaded = False
 # Function to process detection results and use Qwen2-VL for answering questions
+@spaces.GPU
 def ask_qwen_about_detections(input_image, question, obj_threshold, face_threshold, tag):
     """
     Process an image with detection and use Qwen2-VL to answer questions
     """
+    global qwen_model, qwen_processor, qwen_model_loaded
+    # Load model if not already loaded
+    if qwen_model is None:
+        try:
+            qwen_model, qwen_processor = load_qwen2vl_model()
+            qwen_model_loaded = True
+        except Exception as e:
+            return f"Failed to load Qwen2-VL model: {e}", None, None
     # Get detection results and formatted text
     qwen_input, output_img = process_image_for_qwen(input_image, obj_threshold, face_threshold, tag)
                 lines=2
             )
+            qwen_ask_button = gr.Button("Ask RC-MLLM-2B")
         with gr.Column():
             qwen_output_image = gr.Image(label="Detection Result")
     # Model status display
     model_status = gr.Markdown(
+        "🔄 RC-MLLM model will be loaded when first used (ZeroGPU)"
     )
     # Instructions for RC-MLLM section