Spaces:

yonigozlan
/

Segment-Anything-2-video-tracking

Running on Zero

App Files

xet

Community

yonigozlan HF Staff committed 23 days ago

Commit

942c318

1 Parent(s): 5b5416f

load video with cv2

Browse files

Files changed (1) hide show

app.py +29 -64

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gc
 from copy import deepcopy
 from typing import Optional
 import gradio as gr
 import numpy as np
 import spaces
@@ -10,7 +11,6 @@ import torch
 from gradio.themes import Soft
 from PIL import Image, ImageDraw
-# Prefer local transformers in the workspace
 from transformers import AutoModel, Sam2VideoProcessor
@@ -32,56 +32,25 @@ def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], di
     """Load video frames as PIL Images using transformers.video_utils if available,
     otherwise fall back to OpenCV. Returns (frames, info).
     """
-    try:
-        from transformers.video_utils import load_video  # type: ignore
-        frames, info = load_video(video_path_or_url)
-        # Ensure PIL format
-        pil_frames = []
-        for fr in frames:
-            if isinstance(fr, Image.Image):
-                pil_frames.append(fr.convert("RGB"))
-            else:
-                pil_frames.append(Image.fromarray(fr).convert("RGB"))
-        info = info if info is not None else {}
-        # Ensure fps present when possible (fallback to cv2 probe)
-        if "fps" not in info or not info.get("fps"):
-            try:
-                import cv2  # type: ignore
-                cap = cv2.VideoCapture(video_path_or_url)
-                fps_val = cap.get(cv2.CAP_PROP_FPS)
-                cap.release()
-                if fps_val and fps_val > 0:
-                    info["fps"] = float(fps_val)
-            except Exception as e:
-                print(f"Failed to render video with cv2: {e}")
-                pass
-        return pil_frames, info
-    except Exception as e:
-        print(f"Failed to load video with transformers.video_utils: {e}")
-        # Fallback to OpenCV
-        try:
-            import cv2  # type: ignore
-            cap = cv2.VideoCapture(video_path_or_url)
-            frames = []
-            while cap.isOpened():
-                ret, frame = cap.read()
-                if not ret:
-                    break
-                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                frames.append(Image.fromarray(frame_rgb))
-            # Gather fps if available
-            fps_val = cap.get(cv2.CAP_PROP_FPS)
-            cap.release()
-            info = {
-                "num_frames": len(frames),
-                "fps": float(fps_val) if fps_val and fps_val > 0 else None,
-            }
-            return frames, info
-        except Exception as e:
-            raise RuntimeError(f"Failed to load video: {e}")
 def overlay_masks_on_frame(
@@ -196,14 +165,12 @@ def load_model_if_needed(GLOBAL_STATE: gr.State) -> tuple[AutoModel, Sam2VideoPr
     GLOBAL_STATE.dtype = dtype
     GLOBAL_STATE.model_repo_id = desired_repo
-    return model, processor, device, dtype
 def ensure_session_for_current_model(GLOBAL_STATE: gr.State) -> None:
     """Ensure the model/processor match the selected repo and inference_session exists.
     If a video is already loaded, re-initialize the inference session when needed.
     """
-    model, processor, device, dtype = load_model_if_needed(GLOBAL_STATE)
     desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
     if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
         if GLOBAL_STATE.video_frames:
@@ -213,10 +180,10 @@ def ensure_session_for_current_model(GLOBAL_STATE: gr.State) -> None:
             GLOBAL_STATE.boxes_by_frame_obj.clear()
             GLOBAL_STATE.composited_frames.clear()
             GLOBAL_STATE.inference_session = None
-            GLOBAL_STATE.inference_session = processor.init_video_session(
-                inference_device=device,
                 video_storage_device="cpu",
-                dtype=dtype,
             )
             GLOBAL_STATE.session_repo_id = desired_repo
@@ -229,7 +196,7 @@ def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppSt
     GLOBAL_STATE.masks_by_frame = {}
     GLOBAL_STATE.color_by_obj = {}
-    model, processor, device, dtype = load_model_if_needed(GLOBAL_STATE)
     # Gradio Video may provide a dict with 'name' or a direct file path
     video_path: Optional[str] = None
@@ -261,10 +228,10 @@ def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppSt
     # Try to capture original FPS if provided by loader
     GLOBAL_STATE.video_fps = float(fps_in)
     # Initialize session
-    inference_session = processor.init_video_session(
-        inference_device=device,
         video_storage_device="cpu",
-        dtype=dtype,
     )
     GLOBAL_STATE.inference_session = inference_session
@@ -272,7 +239,7 @@ def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppSt
     max_idx = len(frames) - 1
     status = (
         f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps{trimmed_note}. "
-        f"Device: {device}, dtype: bfloat16"
     )
     return GLOBAL_STATE, 0, max_idx, first_frame, status
@@ -749,8 +716,6 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
         out_path = "/tmp/sam2_playback.mp4"
         # Prefer imageio with PyAV/ffmpeg to respect exact fps
         try:
-            import cv2  # type: ignore
             fourcc = cv2.VideoWriter_fourcc(*"mp4v")
             writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
             for fr_bgr in frames_np:

 from copy import deepcopy
 from typing import Optional
+import cv2
 import gradio as gr
 import numpy as np
 import spaces
 from gradio.themes import Soft
 from PIL import Image, ImageDraw
 from transformers import AutoModel, Sam2VideoProcessor
     """Load video frames as PIL Images using transformers.video_utils if available,
     otherwise fall back to OpenCV. Returns (frames, info).
     """
+    cap = cv2.VideoCapture(video_path_or_url)
+    frames = []
+    print("loading video frames")
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frames.append(Image.fromarray(frame_rgb))
+    # Gather fps if available
+    fps_val = cap.get(cv2.CAP_PROP_FPS)
+    cap.release()
+    print("loaded video frames")
+    info = {
+        "num_frames": len(frames),
+        "fps": float(fps_val) if fps_val and fps_val > 0 else None,
+    }
+    return frames, info
 def overlay_masks_on_frame(
     GLOBAL_STATE.dtype = dtype
     GLOBAL_STATE.model_repo_id = desired_repo
 def ensure_session_for_current_model(GLOBAL_STATE: gr.State) -> None:
     """Ensure the model/processor match the selected repo and inference_session exists.
     If a video is already loaded, re-initialize the inference session when needed.
     """
+    load_model_if_needed(GLOBAL_STATE)
     desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
     if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
         if GLOBAL_STATE.video_frames:
             GLOBAL_STATE.boxes_by_frame_obj.clear()
             GLOBAL_STATE.composited_frames.clear()
             GLOBAL_STATE.inference_session = None
+            GLOBAL_STATE.inference_session = GLOBAL_STATE.processor.init_video_session(
+                inference_device=GLOBAL_STATE.device,
                 video_storage_device="cpu",
+                dtype=GLOBAL_STATE.dtype,
             )
             GLOBAL_STATE.session_repo_id = desired_repo
     GLOBAL_STATE.masks_by_frame = {}
     GLOBAL_STATE.color_by_obj = {}
+    load_model_if_needed(GLOBAL_STATE)
     # Gradio Video may provide a dict with 'name' or a direct file path
     video_path: Optional[str] = None
     # Try to capture original FPS if provided by loader
     GLOBAL_STATE.video_fps = float(fps_in)
     # Initialize session
+    inference_session = GLOBAL_STATE.processor.init_video_session(
+        inference_device=GLOBAL_STATE.device,
         video_storage_device="cpu",
+        dtype=GLOBAL_STATE.dtype,
     )
     GLOBAL_STATE.inference_session = inference_session
     max_idx = len(frames) - 1
     status = (
         f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps{trimmed_note}. "
+        f"Device: {GLOBAL_STATE.device}, dtype: bfloat16"
     )
     return GLOBAL_STATE, 0, max_idx, first_frame, status
         out_path = "/tmp/sam2_playback.mp4"
         # Prefer imageio with PyAV/ffmpeg to respect exact fps
         try:
             fourcc = cv2.VideoWriter_fourcc(*"mp4v")
             writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
             for fr_bgr in frames_np: