Dhruv-Ty committed on
Commit ac239ba · 1 Parent(s): 8ce0600

initial commit

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. LICENSE +201 -0
  2. app.py +948 -5
  3. checkpoints/MedSAM2_2411.pt +3 -0
  4. checkpoints/MedSAM2_CTLesion.pt +3 -0
  5. checkpoints/MedSAM2_MRI_LiverLesion.pt +3 -0
  6. checkpoints/MedSAM2_US_Heart.pt +3 -0
  7. checkpoints/MedSAM2_latest.pt +3 -0
  8. checkpoints/README.md +10 -0
  9. download.sh +35 -0
  10. download_checkpoints.py +21 -0
  11. gitignore +13 -0
  12. medsam2_infer_3D_CT.py +304 -0
  13. medsam2_infer_video.py +570 -0
  14. multi_node_train.sh +48 -0
  15. notebooks/MedSAM2_Inference_Video.ipynb +0 -0
  16. notebooks/MedSAM2_inference_CT_Lesion.ipynb +0 -0
  17. pyproject.toml +6 -0
  18. requirements.txt +16 -0
  19. sam2/__init__.py +11 -0
  20. sam2/__pycache__/__init__.cpython-312.pyc +0 -0
  21. sam2/__pycache__/build_sam.cpython-312.pyc +0 -0
  22. sam2/__pycache__/sam2_image_predictor.cpython-312.pyc +0 -0
  23. sam2/__pycache__/sam2_video_predictor_npz.cpython-312.pyc +0 -0
  24. sam2/build_sam.py +207 -0
  25. sam2/configs/sam2.1_hiera_t512.yaml +121 -0
  26. sam2/configs/sam2.1_hiera_tiny_finetune512.yaml +389 -0
  27. sam2/csrc/connected_components.cu +289 -0
  28. sam2/modeling/__init__.py +5 -0
  29. sam2/modeling/__pycache__/__init__.cpython-312.pyc +0 -0
  30. sam2/modeling/__pycache__/memory_attention.cpython-312.pyc +0 -0
  31. sam2/modeling/__pycache__/memory_encoder.cpython-312.pyc +0 -0
  32. sam2/modeling/__pycache__/position_encoding.cpython-312.pyc +0 -0
  33. sam2/modeling/__pycache__/sam2_base.cpython-312.pyc +0 -0
  34. sam2/modeling/__pycache__/sam2_utils.cpython-312.pyc +0 -0
  35. sam2/modeling/backbones/__init__.py +5 -0
  36. sam2/modeling/backbones/__pycache__/__init__.cpython-312.pyc +0 -0
  37. sam2/modeling/backbones/__pycache__/hieradet.cpython-312.pyc +0 -0
  38. sam2/modeling/backbones/__pycache__/image_encoder.cpython-312.pyc +0 -0
  39. sam2/modeling/backbones/__pycache__/utils.cpython-312.pyc +0 -0
  40. sam2/modeling/backbones/hieradet.py +317 -0
  41. sam2/modeling/backbones/image_encoder.py +134 -0
  42. sam2/modeling/backbones/utils.py +95 -0
  43. sam2/modeling/memory_attention.py +169 -0
  44. sam2/modeling/memory_encoder.py +181 -0
  45. sam2/modeling/position_encoding.py +221 -0
  46. sam2/modeling/sam/__init__.py +5 -0
  47. sam2/modeling/sam/__pycache__/__init__.cpython-312.pyc +0 -0
  48. sam2/modeling/sam/__pycache__/mask_decoder.cpython-312.pyc +0 -0
  49. sam2/modeling/sam/__pycache__/prompt_encoder.cpython-312.pyc +0 -0
  50. sam2/modeling/sam/__pycache__/transformer.cpython-312.pyc +0 -0
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
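
The appendix above says the notice should be wrapped in the comment syntax of each file format. For the Python sources added in this commit, that boilerplate could look like the following sketch (the bracketed year and owner are placeholders, left unfilled as in the license text):

```python
# Copyright [yyyy] [name of copyright owner]
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```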
app.py CHANGED
@@ -1,7 +1,950 @@
1
- import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
1
+ """
2
+ Gradio app for interactive medical video segmentation using MedSAM2.
3
+ Please use gradio==3.38.0
4
+ """
5
 
6
+ import datetime
7
+ import gc
8
+ from glob import glob
9
+ import hashlib
10
+ import math
11
+ import multiprocessing as mp
12
+ import platform
13
+ import os
14
+ from os.path import basename, splitext, dirname
15
+ import threading
16
+ import time
17
+ os.environ["TORCH_CUDNN_SDPA_ENABLED"] = "1"
18
+ import shutil
19
+ import ffmpeg
20
+ from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
21
+ import zipfile
22
+ import torch
23
+ import numpy as np
24
+ import matplotlib.pyplot as plt
25
+ from PIL import Image
26
+ from sam2.build_sam import build_sam2
27
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
28
+ from sam2.build_sam import build_sam2_video_predictor
29
+ import cv2
30
 
31
+
32
+ user_processes = {}
33
+ PROCESS_TIMEOUT = datetime.timedelta(minutes=15)
34
+
35
+ def reset(seg_tracker):
36
+ if seg_tracker is not None:
37
+ predictor, inference_state, image_predictor = seg_tracker
38
+ predictor.reset_state(inference_state)
39
+ del predictor
40
+ del inference_state
41
+ del image_predictor
42
+ del seg_tracker
43
+ gc.collect()
44
+ torch.cuda.empty_cache()
45
+ return None, ({}, {}), None, None, 0, None, None, None, 0, 0,
46
+
47
+ def extract_video_info(input_video):
48
+ if input_video is None:
49
+ return 4, 4, None, None, None, None, None
50
+ cap = cv2.VideoCapture(input_video)
51
+ fps = cap.get(cv2.CAP_PROP_FPS)
52
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
53
+ cap.release()
54
+ return fps, total_frames, None, None, None, None, None
55
+
56
+ def get_meta_from_video(session_id, input_video, scale_slider, config_path, checkpoint_path):
57
+ output_dir = f'/tmp/output_frames/{session_id}'
58
+ output_masks_dir = f'/tmp/output_masks/{session_id}'
59
+ output_combined_dir = f'/tmp/output_combined/{session_id}'
60
+ clear_folder(output_dir)
61
+ clear_folder(output_masks_dir)
62
+ clear_folder(output_combined_dir)
63
+ if input_video is None:
64
+ return None, ({}, {}), None, None, (4, 1, 4), None, None, None, 0, 0
65
+ cap = cv2.VideoCapture(input_video)
66
+ fps = cap.get(cv2.CAP_PROP_FPS)
67
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
68
+ cap.release()
69
+ frame_interval = max(1, int(fps // scale_slider))
70
+ print(f"frame_interval: {frame_interval}")
71
+ try:
72
+ ffmpeg.input(input_video, hwaccel='cuda').output(
73
+ os.path.join(output_dir, '%07d.jpg'), q=2, start_number=0,
74
+ vf=rf'select=not(mod(n\,{frame_interval}))', vsync='vfr'
75
+ ).run()
76
+ except Exception:
77
+ print("ffmpeg CUDA decoding failed, falling back to CPU decoding")
78
+ ffmpeg.input(input_video).output(
79
+ os.path.join(output_dir, '%07d.jpg'), q=2, start_number=0,
80
+ vf=rf'select=not(mod(n\,{frame_interval}))', vsync='vfr'
81
+ ).run()
82
+
83
+ first_frame_path = os.path.join(output_dir, '0000000.jpg')
84
+ first_frame = cv2.imread(first_frame_path)
85
+ first_frame_rgb = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
86
+
87
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
88
+ if torch.cuda.get_device_properties(0).major >= 8:
89
+ torch.backends.cuda.matmul.allow_tf32 = True
90
+ torch.backends.cudnn.allow_tf32 = True
91
+
92
+ predictor = build_sam2_video_predictor(config_path, checkpoint_path, device="cuda")
93
+ sam2_model = build_sam2(config_path, checkpoint_path, device="cuda")
94
+ image_predictor = SAM2ImagePredictor(sam2_model)
95
+ inference_state = predictor.init_state(video_path=output_dir)
96
+ predictor.reset_state(inference_state)
97
+ return (predictor, inference_state, image_predictor), ({}, {}), first_frame_rgb, first_frame_rgb, (fps, frame_interval, total_frames), None, None, None, 0, 0
98
+
99
+ def mask2bbox(mask):
100
+ if len(np.where(mask > 0)[0]) == 0:
101
+ print('empty mask: no foreground pixels')
102
+ return np.array([0, 0, 0, 0]).astype(np.int64), False
103
+ x_ = np.sum(mask, axis=0)
104
+ y_ = np.sum(mask, axis=1)
105
+ x0 = np.min(np.nonzero(x_)[0])
106
+ x1 = np.max(np.nonzero(x_)[0])
107
+ y0 = np.min(np.nonzero(y_)[0])
108
+ y1 = np.max(np.nonzero(y_)[0])
109
+ return np.array([x0, y0, x1, y1]).astype(np.int64), True
110
+
111
+ def sam_stroke(session_id, seg_tracker, drawing_board, last_draw, frame_num, ann_obj_id):
112
+ predictor, inference_state, image_predictor = seg_tracker
113
+ image_path = f'/tmp/output_frames/{session_id}/{frame_num:07d}.jpg'
114
+ image = cv2.imread(image_path)
115
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
116
+ display_image = drawing_board["image"]
117
+ image_predictor.set_image(image)
118
+ input_mask = drawing_board["mask"]
119
+ input_mask[input_mask != 0] = 255
120
+ if last_draw is not None:
121
+ diff_mask = cv2.absdiff(input_mask, last_draw)
122
+ input_mask = diff_mask
123
+ bbox, hasMask = mask2bbox(input_mask[:, :, 0])
124
+ if not hasMask :
125
+ return seg_tracker, display_image, display_image, None
126
+ masks, scores, logits = image_predictor.predict( point_coords=None, point_labels=None, box=bbox[None, :], multimask_output=False,)
127
+ mask = masks > 0.0
128
+ masked_frame = show_mask(mask, display_image, ann_obj_id)
129
+ masked_with_rect = draw_rect(masked_frame, bbox, ann_obj_id)
130
+ frame_idx, object_ids, masks = predictor.add_new_mask(inference_state, frame_idx=frame_num, obj_id=ann_obj_id, mask=mask[0])
131
+ last_draw = drawing_board["mask"]
132
+ return seg_tracker, masked_with_rect, masked_with_rect, last_draw
133
+
134
+ def draw_rect(image, bbox, obj_id):
135
+ cmap = plt.get_cmap("tab10")
136
+ color = np.array(cmap(obj_id)[:3])
137
+ rgb_color = tuple(map(int, (color[:3] * 255).astype(np.uint8)))
138
+ inv_color = tuple(map(int, (255 - color[:3] * 255).astype(np.uint8)))
139
+ x0, y0, x1, y1 = bbox
140
+ image_with_rect = cv2.rectangle(image.copy(), (x0, y0), (x1, y1), rgb_color, thickness=2)
141
+ return image_with_rect
142
+
143
+ def sam_click(session_id, seg_tracker, frame_num, point_mode, click_stack, ann_obj_id, point):
144
+ points_dict, labels_dict = click_stack
145
+ predictor, inference_state, image_predictor = seg_tracker
146
+ ann_frame_idx = frame_num # the frame index we interact with
147
+ print(f'ann_frame_idx: {ann_frame_idx}')
148
+ if point_mode == "Positive":
149
+ label = np.array([1], np.int32)
150
+ else:
151
+ label = np.array([0], np.int32)
152
+
153
+ if ann_frame_idx not in points_dict:
154
+ points_dict[ann_frame_idx] = {}
155
+ if ann_frame_idx not in labels_dict:
156
+ labels_dict[ann_frame_idx] = {}
157
+
158
+ if ann_obj_id not in points_dict[ann_frame_idx]:
159
+ points_dict[ann_frame_idx][ann_obj_id] = np.empty((0, 2), dtype=np.float32)
160
+ if ann_obj_id not in labels_dict[ann_frame_idx]:
161
+ labels_dict[ann_frame_idx][ann_obj_id] = np.empty((0,), dtype=np.int32)
162
+
163
+ points_dict[ann_frame_idx][ann_obj_id] = np.append(points_dict[ann_frame_idx][ann_obj_id], point, axis=0)
164
+ labels_dict[ann_frame_idx][ann_obj_id] = np.append(labels_dict[ann_frame_idx][ann_obj_id], label, axis=0)
165
+
166
+ click_stack = (points_dict, labels_dict)
167
+
168
+ frame_idx, out_obj_ids, out_mask_logits = predictor.add_new_points(
169
+ inference_state=inference_state,
170
+ frame_idx=ann_frame_idx,
171
+ obj_id=ann_obj_id,
172
+ points=points_dict[ann_frame_idx][ann_obj_id],
173
+ labels=labels_dict[ann_frame_idx][ann_obj_id],
174
+ )
175
+
176
+ image_path = f'/tmp/output_frames/{session_id}/{ann_frame_idx:07d}.jpg'
177
+ image = cv2.imread(image_path)
178
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
179
+
180
+ masked_frame = image.copy()
181
+ for i, obj_id in enumerate(out_obj_ids):
182
+ mask = (out_mask_logits[i] > 0.0).cpu().numpy()
183
+ masked_frame = show_mask(mask, image=masked_frame, obj_id=obj_id)
184
+ masked_frame_with_markers = draw_markers(masked_frame, points_dict[ann_frame_idx], labels_dict[ann_frame_idx])
185
+
186
+ return seg_tracker, masked_frame_with_markers, masked_frame_with_markers, click_stack
187
+
188
+ def draw_markers(image, points_dict, labels_dict):
189
+ cmap = plt.get_cmap("tab10")
190
+ image_h, image_w = image.shape[:2]
191
+ marker_size = max(1, int(min(image_h, image_w) * 0.05))
192
+
193
+ for obj_id in points_dict:
194
+ color = np.array(cmap(obj_id)[:3])
195
+ rgb_color = tuple(map(int, (color[:3] * 255).astype(np.uint8)))
196
+ inv_color = tuple(map(int, (255 - color[:3] * 255).astype(np.uint8)))
197
+ for point, label in zip(points_dict[obj_id], labels_dict[obj_id]):
198
+ x, y = int(point[0]), int(point[1])
199
+ if label == 1:
200
+ cv2.drawMarker(image, (x, y), inv_color, markerType=cv2.MARKER_CROSS, markerSize=marker_size, thickness=2)
201
+ else:
202
+ cv2.drawMarker(image, (x, y), inv_color, markerType=cv2.MARKER_TILTED_CROSS, markerSize=int(marker_size / np.sqrt(2)), thickness=2)
203
+
204
+ return image
205
+
206
+ def show_mask(mask, image=None, obj_id=None):
207
+ cmap = plt.get_cmap("tab10")
208
+ cmap_idx = 0 if obj_id is None else obj_id
209
+ color = np.array([*cmap(cmap_idx)[:3], 0.6])
210
+
211
+ h, w = mask.shape[-2:]
212
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
213
+ mask_image = (mask_image * 255).astype(np.uint8)
214
+ if image is not None:
215
+ image_h, image_w = image.shape[:2]
216
+ if (image_h, image_w) != (h, w):
217
+ raise ValueError(f"Image dimensions ({image_h}, {image_w}) and mask dimensions ({h}, {w}) do not match")
218
+ colored_mask = np.zeros_like(image, dtype=np.uint8)
219
+ for c in range(3):
220
+ colored_mask[..., c] = mask_image[..., c]
221
+ alpha_mask = mask_image[..., 3] / 255.0
222
+ for c in range(3):
223
+ image[..., c] = np.where(alpha_mask > 0, (1 - alpha_mask) * image[..., c] + alpha_mask * colored_mask[..., c], image[..., c])
224
+ return image
225
+ return mask_image
226
+
227
+ def show_res_by_slider(session_id, frame_per, click_stack):
228
+ image_path = f'/tmp/output_frames/{session_id}'
229
+ output_combined_dir = f'/tmp/output_combined/{session_id}'
230
+
231
+ combined_frames = sorted([os.path.join(output_combined_dir, img_name) for img_name in os.listdir(output_combined_dir)])
232
+ if combined_frames:
233
+ output_masked_frame_path = combined_frames
234
+ else:
235
+ original_frames = sorted([os.path.join(image_path, img_name) for img_name in os.listdir(image_path)])
236
+ output_masked_frame_path = original_frames
237
+
238
+ total_frames_num = len(output_masked_frame_path)
239
+ if total_frames_num == 0:
240
+ print("No output results found")
241
+ return None, None, 0
242
+ else:
243
+ frame_num = math.floor(total_frames_num * frame_per)
244
+ if frame_num >= total_frames_num:
245
+ frame_num = total_frames_num - 1
246
+ chosen_frame_path = output_masked_frame_path[frame_num]
247
+ print(f"{chosen_frame_path}")
248
+ chosen_frame_show = cv2.imread(chosen_frame_path)
249
+ chosen_frame_show = cv2.cvtColor(chosen_frame_show, cv2.COLOR_BGR2RGB)
250
+ points_dict, labels_dict = click_stack
251
+ if frame_num in points_dict and frame_num in labels_dict:
252
+ chosen_frame_show = draw_markers(chosen_frame_show, points_dict[frame_num], labels_dict[frame_num])
253
+ return chosen_frame_show, chosen_frame_show, frame_num
254
+
255
+ def clear_folder(folder_path):
256
+ if os.path.exists(folder_path):
257
+ shutil.rmtree(folder_path)
258
+ os.makedirs(folder_path)
259
+
260
+ def zip_folder(folder_path, output_zip_path):
261
+ with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_STORED) as zipf:
262
+ for root, _, files in os.walk(folder_path):
263
+ for file in files:
264
+ file_path = os.path.join(root, file)
265
+ zipf.write(file_path, os.path.relpath(file_path, folder_path))
266
+
267
+ def tracking_objects(session_id, seg_tracker, frame_num, input_video):
268
+ output_dir = f'/tmp/output_frames/{session_id}'
269
+ output_masks_dir = f'/tmp/output_masks/{session_id}'
270
+ output_combined_dir = f'/tmp/output_combined/{session_id}'
271
+ output_files_dir = f'/tmp/output_files/{session_id}'
272
+ output_video_path = f'{output_files_dir}/output_video.mp4'
273
+ output_zip_path = f'{output_files_dir}/output_masks.zip'
274
+ clear_folder(output_masks_dir)
275
+ clear_folder(output_combined_dir)
276
+ clear_folder(output_files_dir)
277
+ video_segments = {}
278
+ predictor, inference_state, image_predictor = seg_tracker
279
+ for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
280
+ video_segments[out_frame_idx] = {
281
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
282
+ for i, out_obj_id in enumerate(out_obj_ids)
283
+ }
284
+ frame_files = sorted([f for f in os.listdir(output_dir) if f.endswith('.jpg')])
285
+ # for frame_idx in sorted(video_segments.keys()):
286
+ for frame_file in frame_files:
287
+ frame_idx = int(os.path.splitext(frame_file)[0])
288
+ frame_path = os.path.join(output_dir, frame_file)
289
+ image = cv2.imread(frame_path)
290
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
291
+ masked_frame = image.copy()
292
+ if frame_idx in video_segments:
293
+ for obj_id, mask in video_segments[frame_idx].items():
294
+ masked_frame = show_mask(mask, image=masked_frame, obj_id=obj_id)
295
+ mask_output_path = os.path.join(output_masks_dir, f'{obj_id}_{frame_idx:07d}.png')
296
+ cv2.imwrite(mask_output_path, show_mask(mask))
297
+ combined_output_path = os.path.join(output_combined_dir, f'{frame_idx:07d}.png')
298
+ combined_image_bgr = cv2.cvtColor(masked_frame, cv2.COLOR_RGB2BGR)
299
+ cv2.imwrite(combined_output_path, combined_image_bgr)
300
+ if frame_idx == frame_num:
301
+ final_masked_frame = masked_frame
302
+
303
+ cap = cv2.VideoCapture(input_video)
304
+ fps = cap.get(cv2.CAP_PROP_FPS)
305
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
306
+ frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
307
+ frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
308
+ cap.release()
309
+ # output_frames = int(total_frames * scale_slider)
310
+ output_frames = len([name for name in os.listdir(output_combined_dir) if os.path.isfile(os.path.join(output_combined_dir, name)) and name.endswith('.png')])
311
+ out_fps = fps * output_frames / total_frames
312
+
313
+ # ffmpeg.input(os.path.join(output_combined_dir, '%07d.png'), framerate=out_fps).output(output_video_path, vcodec='h264_nvenc', pix_fmt='yuv420p').run()
314
+
315
+ # fourcc = cv2.VideoWriter_fourcc(*"mp4v")
316
+ # out = cv2.VideoWriter(output_video_path, fourcc, out_fps, (frame_width, frame_height))
317
+ # for i in range(output_frames):
318
+ # frame_path = os.path.join(output_combined_dir, f'{i:07d}.png')
319
+ # frame = cv2.imread(frame_path)
320
+ # out.write(frame)
321
+ # out.release()
322
+
323
+ image_files = [os.path.join(output_combined_dir, f'{i:07d}.png') for i in range(output_frames)]
324
+ clip = ImageSequenceClip(image_files, fps=out_fps)
325
+ clip.write_videofile(output_video_path, codec="libx264", fps=out_fps)
326
+
327
+ zip_folder(output_masks_dir, output_zip_path)
328
+ print("done")
329
+ return final_masked_frame, final_masked_frame, output_video_path, output_video_path, output_zip_path, ({}, {})
330
+
331
+ def increment_ann_obj_id(max_obj_id):
332
+ max_obj_id += 1
333
+ ann_obj_id = max_obj_id
334
+ return ann_obj_id, max_obj_id
335
+
336
+ def update_current_id(ann_obj_id):
337
+ return ann_obj_id
338
+
339
+ def drawing_board_get_input_first_frame(input_first_frame):
340
+ return input_first_frame
341
+
342
+ def process_video(queue, result_queue, session_id):
343
+ seg_tracker = None
344
+ click_stack = ({}, {})
345
+ frame_num = int(0)
346
+ ann_obj_id = int(0)
347
+ last_draw = None
348
+ while True:
349
+ task = queue.get()
350
+ if task["command"] == "exit":
351
+ print(f"Process for {session_id} exiting.")
352
+ break
353
+ elif task["command"] == "extract_video_info":
354
+ input_video = task["input_video"]
355
+ fps, total_frames, input_first_frame, drawing_board, output_video, output_mp4, output_mask = extract_video_info(input_video)
356
+ result_queue.put({"fps": fps, "total_frames": total_frames, "input_first_frame": input_first_frame, "drawing_board": drawing_board, "output_video": output_video, "output_mp4": output_mp4, "output_mask": output_mask})
357
+ elif task["command"] == "get_meta_from_video":
358
+ input_video = task["input_video"]
359
+ scale_slider = task["scale_slider"]
360
+ config_path = task["config_path"]
361
+ checkpoint_path = task["checkpoint_path"]
362
+ seg_tracker, click_stack, input_first_frame, drawing_board, frame_per, output_video, output_mp4, output_mask, ann_obj_id, max_obj_id = get_meta_from_video(session_id, input_video, scale_slider, config_path, checkpoint_path)
363
+ result_queue.put({"input_first_frame": input_first_frame, "drawing_board": drawing_board, "frame_per": frame_per, "output_video": output_video, "output_mp4": output_mp4, "output_mask": output_mask, "ann_obj_id": ann_obj_id, "max_obj_id": max_obj_id})
364
+ elif task["command"] == "sam_stroke":
365
+ drawing_board = task["drawing_board"]
366
+ last_draw = task["last_draw"]
367
+ frame_num = task["frame_num"]
368
+ ann_obj_id = task["ann_obj_id"]
369
+ seg_tracker, input_first_frame, drawing_board, last_draw = sam_stroke(session_id, seg_tracker, drawing_board, last_draw, frame_num, ann_obj_id)
370
+ result_queue.put({"input_first_frame": input_first_frame, "drawing_board": drawing_board, "last_draw": last_draw})
371
+ elif task["command"] == "sam_click":
372
+ frame_num = task["frame_num"]
373
+ point_mode = task["point_mode"]
374
+ click_stack = task["click_stack"]
375
+ ann_obj_id = task["ann_obj_id"]
376
+ point = task["point"]
377
+ seg_tracker, input_first_frame, drawing_board, last_draw = sam_click(session_id, seg_tracker, frame_num, point_mode, click_stack, ann_obj_id, point)
378
+ result_queue.put({"input_first_frame": input_first_frame, "drawing_board": drawing_board, "last_draw": last_draw})
379
+ elif task["command"] == "increment_ann_obj_id":
380
+ max_obj_id = task["max_obj_id"]
381
+ ann_obj_id, max_obj_id = increment_ann_obj_id(max_obj_id)
382
+ result_queue.put({"ann_obj_id": ann_obj_id, "max_obj_id": max_obj_id})
383
+ elif task["command"] == "update_current_id":
384
+ ann_obj_id = task["ann_obj_id"]
385
+ ann_obj_id = update_current_id(ann_obj_id)
386
+ result_queue.put({"ann_obj_id": ann_obj_id})
387
+ elif task["command"] == "drawing_board_get_input_first_frame":
388
+ input_first_frame = task["input_first_frame"]
389
+ input_first_frame = drawing_board_get_input_first_frame(input_first_frame)
390
+ result_queue.put({"input_first_frame": input_first_frame})
391
+ elif task["command"] == "reset":
392
+ seg_tracker, click_stack, input_first_frame, drawing_board, frame_per, output_video, output_mp4, output_mask, ann_obj_id, max_obj_id = reset(seg_tracker)
393
+ result_queue.put({"click_stack": click_stack, "input_first_frame": input_first_frame, "drawing_board": drawing_board, "frame_per": frame_per, "output_video": output_video, "output_mp4": output_mp4, "output_mask": output_mask, "ann_obj_id": ann_obj_id, "max_obj_id": max_obj_id})
394
+ elif task["command"] == "show_res_by_slider":
395
+ frame_per = task["frame_per"]
396
+ click_stack = task["click_stack"]
397
+ input_first_frame, drawing_board, frame_num = show_res_by_slider(session_id, frame_per, click_stack)
398
+ result_queue.put({"input_first_frame": input_first_frame, "drawing_board": drawing_board, "frame_num": frame_num})
399
+ elif task["command"] == "tracking_objects":
400
+ frame_num = task["frame_num"]
401
+ input_video = task["input_video"]
402
+ input_first_frame, drawing_board, output_video, output_mp4, output_mask, click_stack = tracking_objects(session_id, seg_tracker, frame_num, input_video)
403
+ result_queue.put({"input_first_frame": input_first_frame, "drawing_board": drawing_board, "output_video": output_video, "output_mp4": output_mp4, "output_mask": output_mask, "click_stack": click_stack})
404
+ else:
405
+ print(f"Unknown command {task['command']} for {session_id}")
406
+ result_queue.put("Unknown command")
407
+
408
+ def start_process(session_id):
409
+ if session_id not in user_processes:
410
+ queue = mp.Queue()
411
+ result_queue = mp.Queue()
412
+ process = mp.Process(target=process_video, args=(queue, result_queue, session_id))
413
+ process.start()
414
+ user_processes[session_id] = {
415
+ "process": process,
416
+ "queue": queue,
417
+ "result_queue": result_queue,
418
+ "last_active": datetime.datetime.now()
419
+ }
420
+ else:
421
+ user_processes[session_id]["last_active"] = datetime.datetime.now()
422
+ return user_processes[session_id]["queue"]
423
+
424
+ # def clean_up_processes(session_id, init_clean = False):
425
+ # now = datetime.datetime.now()
426
+ # to_remove = []
427
+ # for s_id, process_info in user_processes.items():
428
+ # if (now - process_info["last_active"] > PROCESS_TIMEOUT) or (s_id == session_id and init_clean):
429
+ # process_info["queue"].put({"command": "exit"})
430
+ # process_info["process"].terminate()
431
+ # process_info["process"].join()
432
+ # to_remove.append(s_id)
433
+ # for s_id in to_remove:
434
+ # del user_processes[s_id]
435
+ # print(f"Cleaned up process for session {s_id}.")
436
+
437
+ def monitor_and_cleanup_processes():
438
+ while True:
439
+ now = datetime.datetime.now()
440
+ to_remove = []
441
+ for session_id, process_info in user_processes.items():
442
+ if now - process_info["last_active"] > PROCESS_TIMEOUT:
443
+ process_info["queue"].put({"command": "exit"})
444
+ process_info["process"].terminate()
445
+ process_info["process"].join()
446
+ to_remove.append(session_id)
447
+ for session_id in to_remove:
448
+ del user_processes[session_id]
449
+ print(f"Automatically cleaned up process for session {session_id}.")
450
+ time.sleep(10)
451
+
452
+ def seg_track_app():
453
+ # Only supports gradio==3.38.0
454
+ import gradio as gr
455
+
456
+ def extract_session_id_from_request(request: gr.Request):
457
+ session_id = hashlib.sha256(f'{request.client.host}:{request.client.port}'.encode('utf-8')).hexdigest()
458
+ # cookies = request.kwargs["headers"].get('cookie', '')
459
+ # session_id = None
460
+ # if '_gid=' in cookies:
461
+ # session_id = cookies.split('_gid=')[1].split(';')[0]
462
+ # else:
463
+ # session_id = str(uuid.uuid4())
464
+ print(f"session_id {session_id}")
465
+ return session_id
466
+
467
+ def handle_extract_video_info(session_id, input_video):
468
+ # clean_up_processes(session_id, init_clean=True)
469
+ if input_video == None:
470
+ return 0, 0, {
471
+ "minimum": 0.0,
472
+ "maximum": 100,
473
+ "step": 0.01,
474
+ "value": 0.0,
475
+ }, None, None, None, None, None
476
+ queue = start_process(session_id)
477
+ result_queue = user_processes[session_id]["result_queue"]
478
+ queue.put({"command": "extract_video_info", "input_video": input_video})
479
+ result = result_queue.get()
480
+ fps = result.get("fps")
481
+ total_frames = result.get("total_frames")
482
+ input_first_frame = result.get("input_first_frame")
483
+ drawing_board = result.get("drawing_board")
484
+ output_video = result.get("output_video")
485
+ output_mp4 = result.get("output_mp4")
486
+ output_mask = result.get("output_mask")
487
+ scale_slider = gr.Slider.update(minimum=1.0,
488
+ maximum=fps,
489
+ step=1.0,
490
+ value=fps,)
491
+ frame_per = gr.Slider.update(minimum= 0.0,
492
+ maximum= total_frames / fps,
493
+ step=1.0/fps,
494
+ value=0.0,)
495
+ slider_state = {
496
+ "minimum": 0.0,
497
+ "maximum": total_frames / fps,
498
+ "step": 1.0/fps,
499
+ "value": 0.0,
500
+ }
501
+ return scale_slider, frame_per, slider_state, input_first_frame, drawing_board, output_video, output_mp4, output_mask
502
+
503
+ def handle_get_meta_from_video(session_id, input_video, scale_slider, selected_config, selected_checkpoint):
504
+ config_path = config_file_map[selected_config]
505
+ checkpoint_path = checkpoint_file_map[selected_checkpoint]
506
+ # clean_up_processes(session_id)
507
+ queue = start_process(session_id)
508
+ result_queue = user_processes[session_id]["result_queue"]
509
+ queue.put({"command": "get_meta_from_video", "input_video": input_video, "scale_slider": scale_slider, "config_path": config_path, "checkpoint_path": checkpoint_path})
510
+ result = result_queue.get()
511
+ input_first_frame = result.get("input_first_frame")
512
+ drawing_board = result.get("drawing_board")
513
+ (fps, frame_interval, total_frames) = result.get("frame_per")
514
+ output_video = result.get("output_video")
515
+ output_mp4 = result.get("output_mp4")
516
+ output_mask = result.get("output_mask")
517
+ ann_obj_id = result.get("ann_obj_id")
518
+ max_obj_id = result.get("max_obj_id")
519
+ frame_per = gr.Slider.update(minimum= 0.0,
520
+ maximum= total_frames / fps,
521
+ step=frame_interval / fps / 2,
522
+ value=0.0,)
523
+ slider_state = {
524
+ "minimum": 0.0,
525
+ "maximum": total_frames / fps,
526
+ "step": frame_interval/fps / 2 ,
527
+ "value": 0.0,
528
+ }
529
+ obj_id_slider = gr.Slider.update(
530
+ maximum=max_obj_id,
531
+ value=ann_obj_id
532
+ )
533
+ return input_first_frame, drawing_board, frame_per, slider_state, output_video, output_mp4, output_mask, ann_obj_id, max_obj_id, obj_id_slider
534
+
535
+ def handle_sam_stroke(session_id, drawing_board, last_draw, frame_num, ann_obj_id):
536
+ # clean_up_processes(session_id)
537
+ queue = start_process(session_id)
538
+ result_queue = user_processes[session_id]["result_queue"]
539
+ queue.put({"command": "sam_stroke", "drawing_board": drawing_board, "last_draw": last_draw, "frame_num": frame_num, "ann_obj_id": ann_obj_id})
540
+ result = result_queue.get()
541
+ input_first_frame = result.get("input_first_frame")
542
+ drawing_board = result.get("drawing_board")
543
+ last_draw = result.get("last_draw")
544
+ return input_first_frame, drawing_board, last_draw
545
+
546
+ def handle_sam_click(session_id, frame_num, point_mode, click_stack, ann_obj_id, evt: gr.SelectData):
547
+ # clean_up_processes(session_id)
548
+ queue = start_process(session_id)
549
+ result_queue = user_processes[session_id]["result_queue"]
550
+ point = np.array([[evt.index[0], evt.index[1]]], dtype=np.float32)
551
+ queue.put({"command": "sam_click", "frame_num": frame_num, "point_mode": point_mode, "click_stack": click_stack, "ann_obj_id": ann_obj_id, "point": point})
552
+ result = result_queue.get()
553
+ input_first_frame = result.get("input_first_frame")
554
+ drawing_board = result.get("drawing_board")
555
+ last_draw = result.get("last_draw")
556
+ return input_first_frame, drawing_board, last_draw
557
+
558
+ def handle_increment_ann_obj_id(session_id, max_obj_id):
559
+ # clean_up_processes(session_id)
560
+ queue = start_process(session_id)
561
+ result_queue = user_processes[session_id]["result_queue"]
562
+ queue.put({"command": "increment_ann_obj_id", "max_obj_id": max_obj_id})
563
+ result = result_queue.get()
564
+ ann_obj_id = result.get("ann_obj_id")
565
+ max_obj_id = result.get("max_obj_id")
566
+ obj_id_slider = gr.Slider.update(maximum=max_obj_id, value=ann_obj_id)
567
+ return ann_obj_id, max_obj_id, obj_id_slider
568
+
569
+ def handle_update_current_id(session_id, ann_obj_id):
570
+ # clean_up_processes(session_id)
571
+ queue = start_process(session_id)
572
+ result_queue = user_processes[session_id]["result_queue"]
573
+ queue.put({"command": "update_current_id", "ann_obj_id": ann_obj_id})
574
+ result = result_queue.get()
575
+ ann_obj_id = result.get("ann_obj_id")
576
+ return ann_obj_id
577
+
578
+ def handle_drawing_board_get_input_first_frame(session_id, input_first_frame):
579
+ # clean_up_processes(session_id)
580
+ queue = start_process(session_id)
581
+ result_queue = user_processes[session_id]["result_queue"]
582
+ queue.put({"command": "drawing_board_get_input_first_frame", "input_first_frame": input_first_frame})
583
+ result = result_queue.get()
584
+ input_first_frame = result.get("input_first_frame")
585
+ return input_first_frame
586
+
587
+ def handle_reset(session_id):
588
+ # clean_up_processes(session_id)
589
+ queue = start_process(session_id)
590
+ result_queue = user_processes[session_id]["result_queue"]
591
+ queue.put({"command": "reset"})
592
+ result = result_queue.get()
593
+ click_stack = result.get("click_stack")
594
+ input_first_frame = result.get("input_first_frame")
595
+ drawing_board = result.get("drawing_board")
596
+ slider_state = {
597
+ "minimum": 0.0,
598
+ "maximum": 100,
599
+ "step": 0.01,
600
+ "value": 0.0,
601
+ }
602
+ output_video = result.get("output_video")
603
+ output_mp4 = result.get("output_mp4")
604
+ output_mask = result.get("output_mask")
605
+ ann_obj_id = result.get("ann_obj_id")
606
+ max_obj_id = result.get("max_obj_id")
607
+ obj_id_slider = gr.Slider.update(
608
+ maximum=max_obj_id,
609
+ value=ann_obj_id)
610
+ return click_stack, input_first_frame, drawing_board, frame_per, slider_state, output_video, output_mp4, output_mask, ann_obj_id, max_obj_id, obj_id_slider
611
+
612
+ def handle_show_res_by_slider(session_id, frame_per, slider_state, click_stack):
613
+ # clean_up_processes(session_id)
614
+ queue = start_process(session_id)
615
+ frame_per = frame_per/slider_state["maximum"]
616
+ result_queue = user_processes[session_id]["result_queue"]
617
+ queue.put({"command": "show_res_by_slider", "frame_per": frame_per, "click_stack": click_stack})
618
+ result = result_queue.get()
619
+ input_first_frame = result.get("input_first_frame")
620
+ drawing_board = result.get("drawing_board")
621
+ frame_num = result.get("frame_num")
622
+ return input_first_frame, drawing_board, frame_num
623
+
624
+ def handle_tracking_objects(session_id, frame_num, input_video):
625
+ # clean_up_processes(session_id)
626
+ queue = start_process(session_id)
627
+ result_queue = user_processes[session_id]["result_queue"]
628
+ queue.put({"command": "tracking_objects", "frame_num": frame_num, "input_video": input_video})
629
+ result = result_queue.get()
630
+ input_first_frame = result.get("input_first_frame")
631
+ drawing_board = result.get("drawing_board")
632
+ output_video = result.get("output_video")
633
+ output_mp4 = result.get("output_mp4")
634
+ output_mask = result.get("output_mask")
635
+ click_stack = result.get("click_stack")
636
+ return input_first_frame, drawing_board, output_video, output_mp4, output_mask, click_stack
637
+
638
+ ##########################################################
639
+ ###################### Front-end ########################
640
+ ##########################################################
641
+ css = """
642
+ #input_output_video video {
643
+ max-height: 550px;
644
+ max-width: 100%;
645
+ height: auto;
646
+ }
647
+ """
648
+
649
+ if platform.system() == "Windows":
650
+ config_path = os.path.abspath(os.environ.get("CONFIG_PATH", "sam2/configs/"))
651
+ checkpoint_path = os.environ.get("CHECKPOINT_PATH", "checkpoints/")
652
+
653
+ config_files = glob(os.path.join(config_path, "*.yaml"))
654
+ config_files.sort(key=lambda x: '_t.' not in basename(x))
655
+
656
+ checkpoint_files = glob(os.path.join(checkpoint_path, "*.pt"))
657
+ checkpoint_files.sort(key=lambda x: 'tiny' not in basename(x))
658
+
659
+ medsam_checkpoints = glob("checkpoints/*.pt")
660
+ else:
661
+ config_path = "/" + os.path.abspath(os.environ.get("CONFIG_PATH", "./sam2/configs/"))
662
+ checkpoint_path = os.environ.get("CHECKPOINT_PATH", "./checkpoints")
663
+
664
+ config_files = glob(os.path.join(config_path, "*.yaml"))
665
+ config_files.sort(key=lambda x: '_t.' not in basename(x))
666
+
667
+ checkpoint_files = glob(os.path.join(checkpoint_path, "*.pt"))
668
+ checkpoint_files.sort(key=lambda x: 'tiny' not in basename(x))
669
+
670
+ medsam_checkpoints = glob("./checkpoints/*.pt")
671
+
672
+ config_display = [splitext(basename(f))[0] for f in config_files]
673
+ medsam_display = [
674
+ f"{os.path.basename(dirname(dirname(path)))} / {splitext(basename(path))[0]}"
675
+ for path in medsam_checkpoints
676
+ ]
677
+ checkpoint_display = [
678
+ splitext(basename(f))[0] for f in checkpoint_files
679
+ ] + medsam_display
680
+ checkpoint_files.extend(medsam_checkpoints)
681
+
682
+ config_file_map = dict(zip(config_display, config_files))
683
+ checkpoint_file_map = dict(zip(checkpoint_display, checkpoint_files))
684
+
685
+ app = gr.Blocks(css=css)
686
+ with app:
687
+ session_id = gr.State()
688
+ app.load(extract_session_id_from_request, None, session_id)
689
+ gr.Markdown(
690
+ '''
691
+ <div style="text-align:center; margin-bottom:20px;">
692
+ <span style="font-size:3em; font-weight:bold;">MedSAM2: Segment Anything in 3D Medical Images and Videos</span>
693
+ </div>
694
+ <div style="text-align:center; margin-bottom:20px;">
695
+ <a href="https://github.com/bowang-lab/MedSAM/tree/MedSAM2">
696
+ <img src="https://badges.aleen42.com/src/github.svg" alt="GitHub" style="display:inline-block; margin-right:10px;">
697
+ </a>
698
+ <a href="https://arxiv.org/abs/2408.03322">
699
+ <img src="https://img.shields.io/badge/arXiv-2408.03322-green?style=plastic" alt="Paper" style="display:inline-block; margin-right:10px;">
700
+ </a>
701
+ <a href="https://github.com/bowang-lab/MedSAMSlicer/tree/MedSAM2">
702
+ <img src="https://img.shields.io/badge/3D-Slicer-Plugin" alt="3D Slicer Plugin" style="display:inline-block; margin-right:10px;">
703
+ </a>
704
+ </div>
705
+ <div style="text-align:left; margin-bottom:20px;">
706
+ This API supports box prompts (generated from a scribble) and point prompts for medical video segmentation.
707
+ </div>
708
+ <div style="margin-bottom:20px;">
709
+ <ol style="list-style:none; padding-left:0;">
710
+ <li>1. Upload video file</li>
711
+ <li>2. Select model size and downsample frame rate and run <b>Preprocess</b></li>
712
+ <li>3. Use <b>Stroke to Box Prompt</b> to draw box on the first frame or <b>Point Prompt</b> to click on the first frame.</li>
713
+ <li>&nbsp;&nbsp;&nbsp;Note: The bounding rectangle of the stroke should be able to cover the segmentation target.</li>
714
+ <li>4. Click <b>Segment</b> to get the segmentation result</li>
715
+ <li>5. Click <b>Add New Object</b> to add a new object</li>
716
+ <li>6. Click <b>Start Tracking</b> to track objects in the video</li>
717
+ <li>7. Click <b>Reset</b> to reset the app</li>
718
+ <li>8. Download the video with segmentation results</li>
719
+ </ol>
720
+ </div>
721
+ <div style="text-align:left; line-height:1.8;">
722
+ If you find these tools useful, please consider citing the following papers:
723
+ </div>
724
+ <div style="text-align:left; line-height:1.8;">
725
+ Ravi, N., Gabeur, V., Hu, Y.T., Hu, R., Ryali, C., Ma, T., Khedr, H., Rädle, R., Rolland, C., Gustafson, L., Mintun, E., Pan, J., Alwala, K.V., Carion, N., Wu, C.Y., Girshick, R., Dollár, P., Feichtenhofer, C.: SAM 2: Segment Anything in Images and Videos. ICLR 2025
726
+ </div>
727
+ <div style="text-align:left; line-height:1.8;">
728
+ Ma, J.*, Yang, Z.*, Kim, S., Chen, B., Baharoon, M., Fallahpour, A, Asakereh, R., Lyu, H., Wang, B.: MedSAM2: Segment Anything in Medical Images and Videos. arXiv preprint (2025)
729
+ </div>
730
+ '''
731
+ )
732
+
733
+ click_stack = gr.State(({}, {}))
734
+ frame_num = gr.State(value=(int(0)))
735
+ ann_obj_id = gr.State(value=(int(0)))
736
+ max_obj_id = gr.State(value=(int(0)))
737
+ last_draw = gr.State(None)
738
+ slider_state = gr.State(value={
739
+ "minimum": 0.0,
740
+ "maximum": 100,
741
+ "step": 0.01,
742
+ "value": 0.0,
743
+ })
744
+
745
+ with gr.Row():
746
+ with gr.Column(scale=0.5):
747
+ with gr.Row():
748
+ tab_video_input = gr.Tab(label="Video input")
749
+ with tab_video_input:
750
+ input_video = gr.Video(label='Input video', type=["mp4", "mov", "avi"], elem_id="input_output_video")
751
+ with gr.Row():
752
+ # checkpoint = gr.Dropdown(label="Model Size", choices=["tiny", "small", "base-plus", "large"], value="tiny")
753
+ config_dropdown = gr.Dropdown(
754
+ choices=config_display,
755
+ value=config_display[0],
756
+ label="Select Config File"
757
+ )
758
+
759
+ checkpoint_dropdown = gr.Dropdown(
760
+ choices=checkpoint_display,
761
+ value=checkpoint_display[0],
762
+ label="Select Checkpoint File"
763
+ )
764
+ scale_slider = gr.Slider(
765
+ label="Downsample Frame Rate (fps)",
766
+ minimum=0.0,
767
+ maximum=1.0,
768
+ step=0.25,
769
+ value=1.0,
770
+ interactive=True
771
+ )
772
+ preprocess_button = gr.Button(
773
+ value="Preprocess",
774
+ interactive=True,
775
+ )
776
+
777
+ with gr.Row():
778
+ tab_stroke = gr.Tab(label="Stroke to Box Prompt")
779
+ with tab_stroke:
780
+ drawing_board = gr.Image(label='Drawing Board', tool="sketch", brush_radius=10, interactive=True)
781
+ with gr.Row():
782
+ seg_acc_stroke = gr.Button(value="Segment", interactive=True)
783
+
784
+ tab_click = gr.Tab(label="Point Prompt")
785
+ with tab_click:
786
+ input_first_frame = gr.Image(label='Segment result of first frame',interactive=True).style(height=550)
787
+ with gr.Row():
788
+ point_mode = gr.Radio(
789
+ choices=["Positive", "Negative"],
790
+ value="Positive",
791
+ label="Point Prompt",
792
+ interactive=True)
793
+
794
+ with gr.Row():
795
+ with gr.Column():
796
+ frame_per = gr.Slider(
797
+ label = "Time (seconds)",
798
+ minimum= 0.0,
799
+ maximum= 100.0,
800
+ step=0.01,
801
+ value=0.0,
802
+ )
803
+ with gr.Row():
804
+ with gr.Column():
805
+ obj_id_slider = gr.Slider(
806
+ minimum=0,
807
+ maximum=0,
808
+ step=1,
809
+ interactive=True,
810
+ label="Current Object ID"
811
+ )
812
+ with gr.Column():
813
+ new_object_button = gr.Button(
814
+ value="Add New Object",
815
+ interactive=True
816
+ )
817
+ track_for_video = gr.Button(
818
+ value="Start Tracking",
819
+ interactive=True,
820
+ )
821
+ reset_button = gr.Button(
822
+ value="Reset",
823
+ interactive=True, visible=False,
824
+ )
825
+
826
+ with gr.Column(scale=0.5):
827
+ output_video = gr.Video(label='Visualize Results', elem_id="input_output_video")
828
+ output_mp4 = gr.File(label="Predicted video")
829
+ output_mask = gr.File(label="Predicted masks")
830
+
831
+ gr.Markdown(
832
+ '''
833
+ <div style="text-align:center; margin-top: 20px;">
834
+ The authors of this work highly appreciate Meta AI for making SAM2 publicly available to the community.
835
+ The interface was built on <a href="https://github.com/z-x-yang/Segment-and-Track-Anything/blob/main/tutorial/tutorial%20for%20WebUI-1.0-Version.md" target="_blank">SegTracker</a>, which is also an amazing tool for video segmentation tracking.
836
+ <a href="https://docs.google.com/document/d/1idDBV0faOjdjVs-iAHr0uSrw_9_ZzLGrUI2FEdK-lso/edit?usp=sharing" target="_blank">Data source</a>
837
+ </div>
838
+ '''
839
+ )
840
+
841
+ ##########################################################
842
+ ###################### back-end #########################
843
+ ##########################################################
844
+
845
+ # listen to the preprocess button click to get the first frame of video with scaling
846
+ preprocess_button.click(
847
+ fn=handle_get_meta_from_video,
848
+ inputs=[
849
+ session_id,
850
+ input_video,
851
+ scale_slider,
852
+ config_dropdown,
853
+ checkpoint_dropdown
854
+ ],
855
+ outputs=[
856
+ input_first_frame, drawing_board, frame_per, slider_state, output_video, output_mp4, output_mask, ann_obj_id, max_obj_id, obj_id_slider
857
+ ], queue=False, every=15
858
+ )
859
+
860
+ frame_per.release(
861
+ fn=handle_show_res_by_slider,
862
+ inputs=[
863
+ session_id, frame_per, slider_state, click_stack
864
+ ],
865
+ outputs=[
866
+ input_first_frame, drawing_board, frame_num
867
+ ]
868
+ )
869
+
870
+ # Interactively modify the mask according to clicks
871
+ input_first_frame.select(
872
+ fn=handle_sam_click,
873
+ inputs=[
874
+ session_id, frame_num, point_mode, click_stack, ann_obj_id
875
+ ],
876
+ outputs=[
877
+ input_first_frame, drawing_board, click_stack
878
+ ]
879
+ )
880
+
881
+ # Track object in video
882
+ track_for_video.click(
883
+ fn=handle_tracking_objects,
884
+ inputs=[
885
+ session_id,
886
+ frame_num,
887
+ input_video,
888
+ ],
889
+ outputs=[
890
+ input_first_frame,
891
+ drawing_board,
892
+ output_video,
893
+ output_mp4,
894
+ output_mask,
895
+ click_stack
896
+ ], queue=False, every=15
897
+ )
898
+
899
+ reset_button.click(
900
+ fn=handle_reset,
901
+ inputs=[session_id],
902
+ outputs=[
903
+ click_stack, input_first_frame, drawing_board, frame_per, slider_state, output_video, output_mp4, output_mask, ann_obj_id, max_obj_id, obj_id_slider
904
+ ]
905
+ )
906
+
907
+ new_object_button.click(
908
+ fn=handle_increment_ann_obj_id,
909
+ inputs=[ session_id, max_obj_id ],
910
+ outputs=[ ann_obj_id, max_obj_id, obj_id_slider ]
911
+ )
912
+
913
+ obj_id_slider.change(
914
+ fn=handle_update_current_id,
915
+ inputs=[session_id, obj_id_slider],
916
+ outputs=[ann_obj_id]
917
+ )
918
+
919
+ tab_stroke.select(
920
+ fn=handle_drawing_board_get_input_first_frame,
921
+ inputs=[session_id, input_first_frame],
922
+ outputs=[drawing_board,],
923
+ )
924
+
925
+ seg_acc_stroke.click(
926
+ fn=handle_sam_stroke,
927
+ inputs=[
928
+ session_id, drawing_board, last_draw, frame_num, ann_obj_id
929
+ ],
930
+ outputs=[
931
+ input_first_frame, drawing_board, last_draw
932
+ ]
933
+ )
934
+
935
+ input_video.change(
936
+ fn=handle_extract_video_info,
937
+ inputs=[session_id, input_video],
938
+ outputs=[scale_slider, frame_per, slider_state, input_first_frame, drawing_board, output_video, output_mp4, output_mask], queue=False, every=15
939
+ )
940
+
941
+ app.queue(concurrency_count=1)
942
+ app.launch(debug=True, enable_queue=True, share=False, server_name="0.0.0.0", server_port=18862)
943
+ # app.launch(debug=True, enable_queue=True, share=True)
944
+
945
+ if __name__ == "__main__":
946
+ mp.set_start_method("spawn")
947
+ monitor_thread = threading.Thread(target=monitor_and_cleanup_processes)
948
+ monitor_thread.daemon = True
949
+ monitor_thread.start()
950
+ seg_track_app()
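
The wiring above follows the standard Gradio Blocks pattern: each widget event is bound to a handler with explicit `inputs` and `outputs` lists, and state flows through components such as `session_id` and `click_stack`. A minimal sketch of that pattern, using the Gradio 3.x API pinned in `requirements.txt` and a hypothetical handler that is not part of this repo:

```python
# Minimal sketch of the Blocks event-wiring pattern used in app.py.
# `run_segmentation` is a hypothetical placeholder handler.
import gradio as gr

def run_segmentation(image, point_mode):
    # ...call the MedSAM2 predictor here and overlay the mask...
    return image  # return values map positionally onto `outputs`

with gr.Blocks() as demo:
    image_in = gr.Image(label="First frame", interactive=True)
    mode = gr.Radio(choices=["Positive", "Negative"], value="Positive", label="Point Prompt")
    run_btn = gr.Button(value="Segment")
    image_out = gr.Image(label="Segmentation result")

    # inputs/outputs are matched positionally to the handler's arguments/returns
    run_btn.click(fn=run_segmentation, inputs=[image_in, mode], outputs=[image_out])

demo.queue(concurrency_count=1)   # Gradio 3.x queue API, as used above
demo.launch(server_name="0.0.0.0")
```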
checkpoints/MedSAM2_2411.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcd946a4d934f553236866fc7e8af77f7e931430e9c044f4ac9d6a723630a870
3
+ size 156039179
checkpoints/MedSAM2_CTLesion.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78f7e125418dfd6fec22f4afe90bcd85cb1d4423d0a9df36f7a87ed63aa1a5f5
3
+ size 156041079
checkpoints/MedSAM2_MRI_LiverLesion.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3632fc77def3a136d7ae8d734613472d187a803b4a1846370b45419622072b2b
3
+ size 156044532
checkpoints/MedSAM2_US_Heart.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:295c0ff8912c99947c364287bbecd1cd36963f0c0ac67a042d292f0dedf8d933
3
+ size 156041079
checkpoints/MedSAM2_latest.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92743b99f00d078bf32a3afcc38aaa9faf1c1692dffe3eaa7a90938c1991060
3
+ size 156040129
checkpoints/README.md ADDED
@@ -0,0 +1,10 @@
1
+
2
+ Download the checkpoints: `sh download.sh`
3
+
4
+ - `MedSAM2_2411.pt`: The base model trained in Nov. 2024
5
+ - `MedSAM2_US_Heart.pt`: Fine-tuned model for heart ultrasound video segmentation
6
+ - `MedSAM2_MRI_LiverLesion.pt`: Fine-tuned model for liver lesion MRI segmentation
7
+ - `MedSAM2_CTLesion.pt`: Fine-tuned model for CT lesion segmentation
8
+ - `MedSAM2_latest.pt` (recommended): Latest model trained on the combination of existing public datasets and newly annotated datasets
9
+
10
+
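
Once a checkpoint is downloaded, it can be paired with the tiny-Hiera config shipped in `sam2/configs`. A hedged sketch, mirroring the defaults of `medsam2_infer_3D_CT.py` (adjust the paths if you store the checkpoints elsewhere):

```python
# Sketch: build a MedSAM2 video predictor from a downloaded checkpoint.
from sam2.build_sam import build_sam2_video_predictor_npz

predictor = build_sam2_video_predictor_npz(
    config_file="configs/sam2.1_hiera_t512.yaml",
    ckpt_path="checkpoints/MedSAM2_latest.pt",
)
```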
download.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/sh
2
+ # Script to download MedSAM2 model checkpoints
3
+ # Create checkpoints directory if it doesn't exist
4
+ mkdir -p checkpoints
5
+ # Use either wget or curl to download the checkpoints
6
+ if command -v wget > /dev/null 2>&1; then
7
+ CMD="wget -P checkpoints"
8
+ elif command -v curl > /dev/null 2>&1; then
9
+ CMD="curl -L -o"
10
+ CURL=1
11
+ else
12
+ echo "Please install wget or curl to download the checkpoints."
13
+ exit 1
14
+ fi
15
+ # Define the base URL for MedSAM2 models on Hugging Face
16
+ HF_BASE_URL="https://huggingface.co/wanglab/MedSAM2/resolve/main"
17
+ # Define the model checkpoint files (as separate variables instead of an array)
18
+ MODEL1="MedSAM2_2411.pt"
19
+ MODEL2="MedSAM2_US_Heart.pt"
20
+ MODEL3="MedSAM2_MRI_LiverLesion.pt"
21
+ MODEL4="MedSAM2_CTLesion.pt"
22
+ MODEL5="MedSAM2_latest.pt"
23
+
24
+ # Download each checkpoint
25
+ for model in $MODEL1 $MODEL2 $MODEL3 $MODEL4 $MODEL5; do
26
+ echo "Downloading ${model}..."
27
+ model_url="${HF_BASE_URL}/${model}"
28
+
29
+ if [ -n "$CURL" ]; then
30
+ $CMD "checkpoints/${model}" "$model_url" || { echo "Failed to download checkpoint from $model_url"; exit 1; }
31
+ else
32
+ $CMD "$model_url" || { echo "Failed to download checkpoint from $model_url"; exit 1; }
33
+ fi
34
+ done
35
+ echo "All MedSAM2 model checkpoints have been downloaded successfully to the 'checkpoints' directory."
download_checkpoints.py ADDED
@@ -0,0 +1,21 @@
1
+ # Run this as a Python script in the terminal (or via Python shell)
2
+ from huggingface_hub import hf_hub_download
3
+ import os
4
+
5
+ os.makedirs("checkpoints", exist_ok=True)
6
+
7
+ model_files = [
8
+ "MedSAM2_2411.pt",
9
+ "MedSAM2_US_Heart.pt",
10
+ "MedSAM2_MRI_LiverLesion.pt",
11
+ "MedSAM2_CTLesion.pt",
12
+ "MedSAM2_latest.pt"
13
+ ]
14
+
15
+ for filename in model_files:
16
+ hf_hub_download(
17
+ repo_id="wanglab/MedSAM2",
18
+ filename=filename,
19
+ local_dir="checkpoints",
20
+ local_dir_use_symlinks=False
21
+ )
gitignore ADDED
@@ -0,0 +1,13 @@
1
+ .vscode/
2
+ .DS_Store
3
+ __pycache__/
4
+ *-checkpoint.ipynb
5
+ .venv
6
+ *.egg*
7
+ build/*
8
+ _C.*
9
+ *.nii.gz
10
+ *.csv
11
+ outputs/*
12
+ checkpoints/*.pt
13
+ *.pt
medsam2_infer_3D_CT.py ADDED
@@ -0,0 +1,304 @@
1
+ from glob import glob
2
+ from tqdm import tqdm
3
+ import os
4
+ from os.path import join, basename
5
+ import re
6
+ import matplotlib.pyplot as plt
7
+ from collections import OrderedDict
8
+ import pandas as pd
9
+ import numpy as np
10
+ import argparse
11
+
12
+ from PIL import Image
13
+ import SimpleITK as sitk
14
+ import torch
15
+ import torch.multiprocessing as mp
16
+ from sam2.build_sam import build_sam2_video_predictor_npz
17
+ import SimpleITK as sitk
18
+ from skimage import measure, morphology
19
+
20
+ torch.set_float32_matmul_precision('high')
21
+ torch.manual_seed(2024)
22
+ torch.cuda.manual_seed(2024)
23
+ np.random.seed(2024)
24
+
25
+ parser = argparse.ArgumentParser()
26
+
27
+ parser.add_argument(
28
+ '--checkpoint',
29
+ type=str,
30
+ default="checkpoints/MedSAM2_latest.pt",
31
+ help='checkpoint path',
32
+ )
33
+ parser.add_argument(
34
+ '--cfg',
35
+ type=str,
36
+ default="configs/sam2.1_hiera_t512.yaml",
37
+ help='model config',
38
+ )
39
+
40
+ parser.add_argument(
41
+ '-i',
42
+ '--imgs_path',
43
+ type=str,
44
+ default="CT_DeepLesion/images",
45
+ help='imgs path',
46
+ )
47
+ parser.add_argument(
48
+ '--gts_path',
49
+ default=None,
50
+ help='simulate prompts based on ground truth',
51
+ )
52
+ parser.add_argument(
53
+ '-o',
54
+ '--pred_save_dir',
55
+ type=str,
56
+ default="./DeeLesion_results",
57
+ help='path to save segmentation results',
58
+ )
59
+ # add option to propagate with either box or mask
60
+ parser.add_argument(
61
+ '--propagate_with_box',
62
+ default=True,
63
+ action='store_true',
64
+ help='whether to propagate with box'
65
+ )
66
+
67
+ args = parser.parse_args()
68
+ checkpoint = args.checkpoint
69
+ model_cfg = args.cfg
70
+ imgs_path = args.imgs_path
71
+ gts_path = args.gts_path
72
+ pred_save_dir = args.pred_save_dir
73
+ os.makedirs(pred_save_dir, exist_ok=True)
74
+ propagate_with_box = args.propagate_with_box
75
+
76
+ def getLargestCC(segmentation):
77
+ labels = measure.label(segmentation)
78
+ largestCC = labels == np.argmax(np.bincount(labels.flat)[1:])+1
79
+ return largestCC
80
+
81
+ def dice_multi_class(preds, targets):
82
+ smooth = 1.0
83
+ assert preds.shape == targets.shape
84
+ labels = np.unique(targets)[1:]
85
+ dices = []
86
+ for label in labels:
87
+ pred = preds == label
88
+ target = targets == label
89
+ intersection = (pred * target).sum()
90
+ dices.append((2.0 * intersection + smooth) / (pred.sum() + target.sum() + smooth))
91
+ return np.mean(dices)
92
+
93
+ def show_mask(mask, ax, mask_color=None, alpha=0.5):
94
+ """
95
+ show mask on the image
96
+
97
+ Parameters
98
+ ----------
99
+ mask : numpy.ndarray
100
+ mask of the image
101
+ ax : matplotlib.axes.Axes
102
+ axes to plot the mask
103
+ mask_color : numpy.ndarray
104
+ color of the mask
105
+ alpha : float
106
+ transparency of the mask
107
+ """
108
+ if mask_color is not None:
109
+ color = np.concatenate([mask_color, np.array([alpha])], axis=0)
110
+ else:
111
+ color = np.array([251/255, 252/255, 30/255, alpha])
112
+ h, w = mask.shape[-2:]
113
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
114
+ ax.imshow(mask_image)
115
+
116
+
117
+ def show_box(box, ax, edgecolor='blue'):
118
+ """
119
+ show bounding box on the image
120
+
121
+ Parameters
122
+ ----------
123
+ box : numpy.ndarray
124
+ bounding box coordinates in the original image
125
+ ax : matplotlib.axes.Axes
126
+ axes to plot the bounding box
127
+ edgecolor : str
128
+ color of the bounding box
129
+ """
130
+ x0, y0 = box[0], box[1]
131
+ w, h = box[2] - box[0], box[3] - box[1]
132
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor=edgecolor, facecolor=(0,0,0,0), lw=2))
133
+
134
+
135
+ def resize_grayscale_to_rgb_and_resize(array, image_size):
136
+ """
137
+ Resize a 3D grayscale NumPy array to an RGB image and then resize it.
138
+
139
+ Parameters:
140
+ array (np.ndarray): Input array of shape (d, h, w).
141
+ image_size (int): Desired size for the width and height.
142
+
143
+ Returns:
144
+ np.ndarray: Resized array of shape (d, 3, image_size, image_size).
145
+ """
146
+ d, h, w = array.shape
147
+ resized_array = np.zeros((d, 3, image_size, image_size))
148
+
149
+ for i in range(d):
150
+ img_pil = Image.fromarray(array[i].astype(np.uint8))
151
+ img_rgb = img_pil.convert("RGB")
152
+ img_resized = img_rgb.resize((image_size, image_size))
153
+ img_array = np.array(img_resized).transpose(2, 0, 1) # (3, image_size, image_size)
154
+ resized_array[i] = img_array
155
+
156
+ return resized_array
157
+
158
+ def mask2D_to_bbox(gt2D, max_shift=20):
159
+ y_indices, x_indices = np.where(gt2D > 0)
160
+ x_min, x_max = np.min(x_indices), np.max(x_indices)
161
+ y_min, y_max = np.min(y_indices), np.max(y_indices)
162
+ H, W = gt2D.shape
163
+ bbox_shift = np.random.randint(0, max_shift + 1, 1)[0]
164
+ x_min = max(0, x_min - bbox_shift)
165
+ x_max = min(W-1, x_max + bbox_shift)
166
+ y_min = max(0, y_min - bbox_shift)
167
+ y_max = min(H-1, y_max + bbox_shift)
168
+ boxes = np.array([x_min, y_min, x_max, y_max])
169
+ return boxes
170
+
171
+ def mask3D_to_bbox(gt3D, max_shift=20):
172
+ z_indices, y_indices, x_indices = np.where(gt3D > 0)
173
+ x_min, x_max = np.min(x_indices), np.max(x_indices)
174
+ y_min, y_max = np.min(y_indices), np.max(y_indices)
175
+ z_min, z_max = np.min(z_indices), np.max(z_indices)
176
+ D, H, W = gt3D.shape
177
+ bbox_shift = np.random.randint(0, max_shift + 1, 1)[0]
178
+ x_min = max(0, x_min - bbox_shift)
179
+ x_max = min(W-1, x_max + bbox_shift)
180
+ y_min = max(0, y_min - bbox_shift)
181
+ y_max = min(H-1, y_max + bbox_shift)
182
+ z_min = max(0, z_min)
183
+ z_max = min(D-1, z_max)
184
+ boxes3d = np.array([x_min, y_min, z_min, x_max, y_max, z_max])
185
+ return boxes3d
186
+
187
+
188
+ DL_info = pd.read_csv('CT_DeepLesion/DeepLesion_Dataset_Info.csv')
189
+ nii_fnames = sorted(os.listdir(imgs_path))
190
+ nii_fnames = [i for i in nii_fnames if i.endswith('.nii.gz')]
191
+ nii_fnames = [i for i in nii_fnames if not i.startswith('._')]
192
+ print(f'Processing {len(nii_fnames)} nii files')
193
+ seg_info = OrderedDict()
194
+ seg_info['nii_name'] = []
195
+ seg_info['key_slice_index'] = []
196
+ seg_info['DICOM_windows'] = []
197
+ # initialized predictor
198
+ predictor = build_sam2_video_predictor_npz(model_cfg, checkpoint)
199
+
200
+ for nii_fname in tqdm(nii_fnames):
201
+ # get corresponding case info
202
+ range_suffix = re.findall(r'\d{3}-\d{3}', nii_fname)[0]
203
+ slice_range = range_suffix.split('-')
204
+ slice_range = [str(int(s)) for s in slice_range]
205
+ slice_range = ', '.join(slice_range)
206
+ nii_image = sitk.ReadImage(join(imgs_path, nii_fname))
207
+ nii_image_data = sitk.GetArrayFromImage(nii_image)
208
+
209
+ case_name = re.findall(r'^(\d{6}_\d{2}_\d{2})', nii_fname)[0]
210
+ case_df = DL_info[
211
+ DL_info['File_name'].str.contains(case_name) &
212
+ DL_info['Slice_range'].str.contains(slice_range)
213
+ ].copy()
214
+
215
+ segs_3D = np.zeros(nii_image_data.shape, dtype=np.uint8)
216
+
217
+ for row_id, row in case_df.iterrows():
218
+ # print(f'Processing {case_name} tumor {tumor_idx}')
219
+ # get the key slice info
220
+ lower_bound, upper_bound = row['DICOM_windows'].split(',')
221
+ lower_bound, upper_bound = float(lower_bound), float(upper_bound)
222
+ nii_image_data_pre = np.clip(nii_image_data, lower_bound, upper_bound)
223
+ nii_image_data_pre = (nii_image_data_pre - np.min(nii_image_data_pre))/(np.max(nii_image_data_pre)-np.min(nii_image_data_pre))*255.0
224
+ nii_image_data_pre = np.uint8(nii_image_data_pre)
225
+ key_slice_idx = row['Key_slice_index']
226
+ key_slice_idx = int(key_slice_idx)
227
+ slice_range = row['Slice_range']
228
+ slice_idx_start, slice_idx_end = slice_range.split(',')
229
+ slice_idx_start, slice_idx_end = int(slice_idx_start), int(slice_idx_end)
230
+ bbox_coords = row['Bounding_boxes']
231
+ bbox_coords = bbox_coords.split(',')
232
+ bbox_coords = [int(float(coord)) for coord in bbox_coords]
233
+ #bbox_coords = expand_box(bbox_coords)
234
+ bbox = np.array(bbox_coords) # y_min, x_min, y_max, x_max
235
+ bbox = np.array([bbox[1], bbox[0], bbox[3], bbox[2]])
236
+
237
+ key_slice_idx_offset = key_slice_idx - slice_idx_start
238
+ key_slice_img = nii_image_data_pre[key_slice_idx_offset, :,:]
239
+
240
+ img_3D_ori = nii_image_data_pre
241
+ assert np.max(img_3D_ori) < 256, f'input data should be in range [0, 255], but got {np.unique(img_3D_ori)}'
242
+
243
+ video_height = key_slice_img.shape[0]
244
+ video_width = key_slice_img.shape[1]
245
+ img_resized = resize_grayscale_to_rgb_and_resize(img_3D_ori, 512)
246
+ img_resized = img_resized / 255.0
247
+ img_resized = torch.from_numpy(img_resized).cuda()
248
+ img_mean=(0.485, 0.456, 0.406)
249
+ img_std=(0.229, 0.224, 0.225)
250
+ img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None].cuda()
251
+ img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None].cuda()
252
+ img_resized -= img_mean
253
+ img_resized /= img_std
254
+ z_mids = []
255
+
256
+ with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
257
+ inference_state = predictor.init_state(img_resized, video_height, video_width)
258
+ if propagate_with_box:
259
+ _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
260
+ inference_state=inference_state,
261
+ frame_idx=key_slice_idx_offset,
262
+ obj_id=1,
263
+ box=bbox,
264
+ )
265
+ else: # gt
266
+ pass
267
+
268
+ for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
269
+ segs_3D[out_frame_idx, (out_mask_logits[0] > 0.0).cpu().numpy()[0]] = 1
270
+ predictor.reset_state(inference_state)
271
+ if propagate_with_box:
272
+ _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
273
+ inference_state=inference_state,
274
+ frame_idx=key_slice_idx_offset,
275
+ obj_id=1,
276
+ box=bbox,
277
+ )
278
+ else: # gt
279
+ pass
280
+
281
+ for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state, reverse=True):
282
+ segs_3D[out_frame_idx, (out_mask_logits[0] > 0.0).cpu().numpy()[0]] = 1
283
+ predictor.reset_state(inference_state)
284
+ if np.max(segs_3D) > 0:
285
+ segs_3D = getLargestCC(segs_3D)
286
+ segs_3D = np.uint8(segs_3D)
287
+ sitk_image = sitk.GetImageFromArray(img_3D_ori)
288
+ sitk_image.CopyInformation(nii_image)
289
+ sitk_mask = sitk.GetImageFromArray(segs_3D)
290
+ sitk_mask.CopyInformation(nii_image)
291
+ # save single lesion
292
+ key_slice_idx = row['Key_slice_index']
293
+ save_seg_name = nii_fname.split('.nii.gz')[0] + f'_k{key_slice_idx}_mask.nii.gz'
294
+ sitk.WriteImage(sitk_image, os.path.join(pred_save_dir, nii_fname.replace('.nii.gz', '_img.nii.gz')))
295
+ sitk.WriteImage(sitk_mask, os.path.join(pred_save_dir, save_seg_name))
296
+ seg_info['nii_name'].append(save_seg_name)
297
+ seg_info['key_slice_index'].append(key_slice_idx)
298
+ seg_info['DICOM_windows'].append(row['DICOM_windows'])
299
+
300
+ seg_info_df = pd.DataFrame(seg_info)
301
+ seg_info_df.to_csv(join(pred_save_dir, 'tiny_seg_info202412.csv'), index=False)
302
+
303
+
304
+
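
The preprocessing above clips each CT volume to the lesion's DICOM window and rescales it to [0, 255] before resizing and normalization. A small self-contained sketch of that windowing step (the window of [-160, 240] HU is illustrative, not a value taken from the dataset):

```python
# Sketch: DICOM windowing as applied in medsam2_infer_3D_CT.py.
import numpy as np

def apply_window(volume_hu: np.ndarray, lower: float, upper: float) -> np.ndarray:
    """Clip a CT volume (in HU) to a window and rescale it to uint8 [0, 255]."""
    clipped = np.clip(volume_hu, lower, upper)
    scaled = (clipped - clipped.min()) / (clipped.max() - clipped.min()) * 255.0
    return scaled.astype(np.uint8)

# Example with an illustrative soft-tissue window:
# volume = sitk.GetArrayFromImage(sitk.ReadImage("case.nii.gz"))  # (D, H, W) in HU
# windowed = apply_window(volume, -160.0, 240.0)
```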
medsam2_infer_video.py ADDED
@@ -0,0 +1,570 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import os
9
+ from collections import defaultdict
10
+
11
+ import numpy as np
12
+ import torch
13
+ from PIL import Image
14
+ from sam2.build_sam import build_sam2_video_predictor
15
+
16
+ # the PNG palette for DAVIS 2017 dataset
17
+ DAVIS_PALETTE = b"\x00\x00\x00\x80\x00\x00\x00\x80\x00\x80\x80\x00\x00\x00\x80\x80\x00\x80\x00\x80\x80\x80\x80\x80@\x00\x00\xc0\x00\x00@\x80\x00\xc0\x80\x00@\x00\x80\xc0\x00\x80@\x80\x80\xc0\x80\x80\x00@\x00\x80@\x00\x00\xc0\x00\x80\xc0\x00\x00@\x80\x80@\x80\x00\xc0\x80\x80\xc0\x80@@\x00\xc0@\x00@\xc0\x00\xc0\xc0\x00@@\x80\xc0@\x80@\xc0\x80\xc0\xc0\x80\x00\x00@\x80\x00@\x00\x80@\x80\x80@\x00\x00\xc0\x80\x00\xc0\x00\x80\xc0\x80\x80\xc0@\x00@\xc0\x00@@\x80@\xc0\x80@@\x00\xc0\xc0\x00\xc0@\x80\xc0\xc0\x80\xc0\x00@@\x80@@\x00\xc0@\x80\xc0@\x00@\xc0\x80@\xc0\x00\xc0\xc0\x80\xc0\xc0@@@\xc0@@@\xc0@\xc0\xc0@@@\xc0\xc0@\xc0@\xc0\xc0\xc0\xc0\xc0 \x00\x00\xa0\x00\x00 \x80\x00\xa0\x80\x00 \x00\x80\xa0\x00\x80 \x80\x80\xa0\x80\x80`\x00\x00\xe0\x00\x00`\x80\x00\xe0\x80\x00`\x00\x80\xe0\x00\x80`\x80\x80\xe0\x80\x80 @\x00\xa0@\x00 \xc0\x00\xa0\xc0\x00 @\x80\xa0@\x80 \xc0\x80\xa0\xc0\x80`@\x00\xe0@\x00`\xc0\x00\xe0\xc0\x00`@\x80\xe0@\x80`\xc0\x80\xe0\xc0\x80 \x00@\xa0\x00@ \x80@\xa0\x80@ \x00\xc0\xa0\x00\xc0 \x80\xc0\xa0\x80\xc0`\x00@\xe0\x00@`\x80@\xe0\x80@`\x00\xc0\xe0\x00\xc0`\x80\xc0\xe0\x80\xc0 @@\xa0@@ \xc0@\xa0\xc0@ @\xc0\xa0@\xc0 \xc0\xc0\xa0\xc0\xc0`@@\xe0@@`\xc0@\xe0\xc0@`@\xc0\xe0@\xc0`\xc0\xc0\xe0\xc0\xc0\x00 \x00\x80 \x00\x00\xa0\x00\x80\xa0\x00\x00 \x80\x80 \x80\x00\xa0\x80\x80\xa0\x80@ \x00\xc0 \x00@\xa0\x00\xc0\xa0\x00@ \x80\xc0 \x80@\xa0\x80\xc0\xa0\x80\x00`\x00\x80`\x00\x00\xe0\x00\x80\xe0\x00\x00`\x80\x80`\x80\x00\xe0\x80\x80\xe0\x80@`\x00\xc0`\x00@\xe0\x00\xc0\xe0\x00@`\x80\xc0`\x80@\xe0\x80\xc0\xe0\x80\x00 @\x80 @\x00\xa0@\x80\xa0@\x00 \xc0\x80 \xc0\x00\xa0\xc0\x80\xa0\xc0@ @\xc0 @@\xa0@\xc0\xa0@@ \xc0\xc0 \xc0@\xa0\xc0\xc0\xa0\xc0\x00`@\x80`@\x00\xe0@\x80\xe0@\x00`\xc0\x80`\xc0\x00\xe0\xc0\x80\xe0\xc0@`@\xc0`@@\xe0@\xc0\xe0@@`\xc0\xc0`\xc0@\xe0\xc0\xc0\xe0\xc0 \x00\xa0 \x00 \xa0\x00\xa0\xa0\x00 \x80\xa0 \x80 \xa0\x80\xa0\xa0\x80` \x00\xe0 \x00`\xa0\x00\xe0\xa0\x00` \x80\xe0 \x80`\xa0\x80\xe0\xa0\x80 `\x00\xa0`\x00 \xe0\x00\xa0\xe0\x00 `\x80\xa0`\x80 \xe0\x80\xa0\xe0\x80``\x00\xe0`\x00`\xe0\x00\xe0\xe0\x00``\x80\xe0`\x80`\xe0\x80\xe0\xe0\x80 @\xa0 @ \xa0@\xa0\xa0@ \xc0\xa0 \xc0 \xa0\xc0\xa0\xa0\xc0` @\xe0 @`\xa0@\xe0\xa0@` \xc0\xe0 \xc0`\xa0\xc0\xe0\xa0\xc0 `@\xa0`@ \xe0@\xa0\xe0@ `\xc0\xa0`\xc0 \xe0\xc0\xa0\xe0\xc0``@\xe0`@`\xe0@\xe0\xe0@``\xc0\xe0`\xc0`\xe0\xc0\xe0\xe0\xc0"
18
+
19
+
20
+ def load_ann_png(path):
21
+ """Load a PNG file as a mask and its palette."""
22
+ mask = Image.open(path)
23
+ palette = mask.getpalette()
24
+ mask = np.array(mask).astype(np.uint8)
25
+ return mask, palette
26
+
27
+
28
+ def save_ann_png(path, mask, palette):
29
+ """Save a mask as a PNG file with the given palette."""
30
+ assert mask.dtype == np.uint8
31
+ assert mask.ndim == 2
32
+ output_mask = Image.fromarray(mask)
33
+ output_mask.putpalette(palette)
34
+ output_mask.save(path)
35
+
36
+
37
+ def get_per_obj_mask(mask):
38
+ """Split a mask into per-object masks."""
39
+ object_ids = np.unique(mask)
40
+ object_ids = object_ids[object_ids > 0].tolist()
41
+ per_obj_mask = {object_id: (mask == object_id) for object_id in object_ids}
42
+ return per_obj_mask
43
+
44
+
45
+ def put_per_obj_mask(per_obj_mask, height, width):
46
+ """Combine per-object masks into a single mask."""
47
+ mask = np.zeros((height, width), dtype=np.uint8)
48
+ object_ids = sorted(per_obj_mask)[::-1]
49
+ for object_id in object_ids:
50
+ object_mask = per_obj_mask[object_id]
51
+ object_mask = object_mask.reshape(height, width)
52
+ mask[object_mask] = object_id
53
+ return mask
54
+
55
+
56
+ def load_masks_from_dir(
57
+ input_mask_dir, video_name, frame_name, per_obj_png_file, allow_missing=False
58
+ ):
59
+ """Load masks from a directory as a dict of per-object masks."""
60
+ if not per_obj_png_file:
61
+ input_mask_path = os.path.join(input_mask_dir, video_name, f"{frame_name}.png")
62
+ if allow_missing and not os.path.exists(input_mask_path):
63
+ return {}, None
64
+ input_mask, input_palette = load_ann_png(input_mask_path)
65
+ per_obj_input_mask = get_per_obj_mask(input_mask)
66
+ else:
67
+ per_obj_input_mask = {}
68
+ input_palette = None
69
+ # each object is a directory in "{object_id:%03d}" format
70
+ for object_name in os.listdir(os.path.join(input_mask_dir, video_name)):
71
+ object_id = int(object_name)
72
+ input_mask_path = os.path.join(
73
+ input_mask_dir, video_name, object_name, f"{frame_name}.png"
74
+ )
75
+ if allow_missing and not os.path.exists(input_mask_path):
76
+ continue
77
+ input_mask, input_palette = load_ann_png(input_mask_path)
78
+ per_obj_input_mask[object_id] = input_mask > 0
79
+
80
+ return per_obj_input_mask, input_palette
81
+
82
+
83
+ def save_palette_masks_to_dir(
84
+ output_mask_dir,
85
+ video_name,
86
+ frame_name,
87
+ per_obj_output_mask,
88
+ height,
89
+ width,
90
+ per_obj_png_file,
91
+ output_palette,
92
+ ):
93
+ """Save masks to a directory as PNG files."""
94
+ os.makedirs(os.path.join(output_mask_dir, video_name), exist_ok=True)
95
+ if not per_obj_png_file:
96
+ output_mask = put_per_obj_mask(per_obj_output_mask, height, width)
97
+ output_mask_path = os.path.join(
98
+ output_mask_dir, video_name, f"{frame_name}.png"
99
+ )
100
+ save_ann_png(output_mask_path, output_mask, output_palette)
101
+ else:
102
+ for object_id, object_mask in per_obj_output_mask.items():
103
+ object_name = f"{object_id:03d}"
104
+ os.makedirs(
105
+ os.path.join(output_mask_dir, video_name, object_name),
106
+ exist_ok=True,
107
+ )
108
+ output_mask = object_mask.reshape(height, width).astype(np.uint8)
109
+ output_mask_path = os.path.join(
110
+ output_mask_dir, video_name, object_name, f"{frame_name}.png"
111
+ )
112
+ save_ann_png(output_mask_path, output_mask, output_palette)
113
+
114
+
115
+ def save_masks_to_dir(
116
+ output_mask_dir,
117
+ video_name,
118
+ frame_name,
119
+ per_obj_output_mask,
120
+ height,
121
+ width,
122
+ per_obj_png_file,
123
+ ):
124
+ """Save masks to a directory as greyscale PNG files."""
125
+ os.makedirs(os.path.join(output_mask_dir, video_name), exist_ok=True)
126
+ if not per_obj_png_file:
127
+ output_mask = put_per_obj_mask(per_obj_output_mask, height, width)
128
+ output_mask_path = os.path.join(
129
+ output_mask_dir, video_name, f"{frame_name}.png"
130
+ )
131
+ assert output_mask.dtype == np.uint8
132
+ assert output_mask.ndim == 2
133
+ output_mask = Image.fromarray(output_mask)
134
+ output_mask.save(output_mask_path)
135
+ else:
136
+ for object_id, object_mask in per_obj_output_mask.items():
137
+ object_name = f"{object_id:03d}"
138
+ os.makedirs(
139
+ os.path.join(output_mask_dir, video_name, object_name),
140
+ exist_ok=True,
141
+ )
142
+ output_mask = object_mask.reshape(height, width).astype(np.uint8)
143
+ output_mask_path = os.path.join(
144
+ output_mask_dir, video_name, object_name, f"{frame_name}.png"
145
+ )
146
+ assert output_mask.dtype == np.uint8
147
+ assert output_mask.ndim == 2
148
+ output_mask = Image.fromarray(output_mask)
149
+ output_mask.save(output_mask_path)
150
+
151
+ @torch.inference_mode()
152
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
153
+ def vos_inference(
154
+ predictor,
155
+ base_video_dir,
156
+ input_mask_dir,
157
+ output_mask_dir,
158
+ video_name,
159
+ score_thresh=0.0,
160
+ use_all_masks=False,
161
+ per_obj_png_file=False,
162
+ save_palette_png=False,
163
+ ):
164
+ """Run inference on a single video with the given predictor."""
165
+ # load the video frames and initialize the inference state on this video
166
+ video_dir = os.path.join(base_video_dir, video_name)
167
+ frame_names = [
168
+ os.path.splitext(p)[0]
169
+ for p in os.listdir(video_dir)
170
+ if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
171
+ ]
172
+ frame_names = list(sorted(frame_names))
173
+ inference_state = predictor.init_state(
174
+ video_path=video_dir, async_loading_frames=False
175
+ )
176
+ height = inference_state["video_height"]
177
+ width = inference_state["video_width"]
178
+ input_palette = None
179
+
180
+ # fetch mask inputs from input_mask_dir (either only mask for the first frame, or all available masks)
181
+ if not use_all_masks:
182
+ # use only the first video's ground-truth mask as the input mask
183
+ input_frame_inds = [0]
184
+ else:
185
+ # use all mask files available in the input_mask_dir as the input masks
186
+ if not per_obj_png_file:
187
+ input_frame_inds = [
188
+ idx
189
+ for idx, name in enumerate(frame_names)
190
+ if os.path.exists(
191
+ os.path.join(input_mask_dir, video_name, f"{name}.png")
192
+ )
193
+ ]
194
+ else:
195
+ input_frame_inds = [
196
+ idx
197
+ for object_name in os.listdir(os.path.join(input_mask_dir, video_name))
198
+ for idx, name in enumerate(frame_names)
199
+ if os.path.exists(
200
+ os.path.join(input_mask_dir, video_name, object_name, f"{name}.png")
201
+ )
202
+ ]
203
+ # check and make sure we got at least one input frame
204
+ if len(input_frame_inds) == 0:
205
+ raise RuntimeError(
206
+ f"In {video_name=}, got no input masks in {input_mask_dir=}. "
207
+ "Please make sure the input masks are available in the correct format."
208
+ )
209
+ input_frame_inds = sorted(set(input_frame_inds))
210
+
211
+ # add those input masks to SAM 2 inference state before propagation
212
+ object_ids_set = None
213
+ for input_frame_idx in input_frame_inds:
214
+ try:
215
+ per_obj_input_mask, input_palette = load_masks_from_dir(
216
+ input_mask_dir=input_mask_dir,
217
+ video_name=video_name,
218
+ frame_name=frame_names[input_frame_idx],
219
+ per_obj_png_file=per_obj_png_file,
220
+ )
221
+ except FileNotFoundError as e:
222
+ raise RuntimeError(
223
+ f"In {video_name=}, failed to load input mask for frame {input_frame_idx=}. "
224
+ "Please add the `--track_object_appearing_later_in_video` flag "
225
+ "for VOS datasets that don't have all objects to track appearing "
226
+ "in the first frame (such as LVOS or YouTube-VOS)."
227
+ ) from e
228
+
229
+ # get the list of object ids to track from the first input frame
230
+ if object_ids_set is None:
231
+ object_ids_set = set(per_obj_input_mask)
232
+ for object_id, object_mask in per_obj_input_mask.items():
233
+ # check and make sure no new object ids appear only in later frames
234
+ if object_id not in object_ids_set:
235
+ raise RuntimeError(
236
+ f"In {video_name=}, got a new {object_id=} appearing only in a "
237
+ f"later {input_frame_idx=} (but not appearing in the first frame). "
238
+ "Please add the `--track_object_appearing_later_in_video` flag "
239
+ "for VOS datasets that don't have all objects to track appearing "
240
+ "in the first frame (such as LVOS or YouTube-VOS)."
241
+ )
242
+ predictor.add_new_mask(
243
+ inference_state=inference_state,
244
+ frame_idx=input_frame_idx,
245
+ obj_id=object_id,
246
+ mask=object_mask,
247
+ )
248
+
249
+ # check and make sure we have at least one object to track
250
+ if object_ids_set is None or len(object_ids_set) == 0:
251
+ raise RuntimeError(
252
+ f"In {video_name=}, got no object ids on {input_frame_inds=}. "
253
+ "Please add the `--track_object_appearing_later_in_video` flag "
254
+ "for VOS datasets that don't have all objects to track appearing "
255
+ "in the first frame (such as LVOS or YouTube-VOS)."
256
+ )
257
+
258
+ # run propagation throughout the video and collect the results in a dict
259
+ os.makedirs(os.path.join(output_mask_dir, video_name), exist_ok=True)
260
+ output_palette = input_palette or DAVIS_PALETTE
261
+ video_segments = {} # video_segments contains the per-frame segmentation results
262
+
263
+ for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
264
+ inference_state
265
+ ):
266
+ per_obj_output_mask = {
267
+ out_obj_id: (out_mask_logits[i] > score_thresh).cpu().numpy()
268
+ for i, out_obj_id in enumerate(out_obj_ids)
269
+ }
270
+ video_segments[out_frame_idx] = per_obj_output_mask
271
+
272
+ # write the output masks as palette PNG files to output_mask_dir
273
+ for out_frame_idx, per_obj_output_mask in video_segments.items():
274
+ if save_palette_png:
275
+ # save palette PNG prediction results
276
+ save_palette_masks_to_dir(
277
+ output_mask_dir=output_mask_dir,
278
+ video_name=video_name,
279
+ frame_name=frame_names[out_frame_idx],
280
+ per_obj_output_mask=per_obj_output_mask,
281
+ height=height,
282
+ width=width,
283
+ per_obj_png_file=per_obj_png_file,
284
+ output_palette=output_palette,
285
+ )
286
+ else:
287
+ # save raw prediction results
288
+ save_masks_to_dir(
289
+ output_mask_dir=output_mask_dir,
290
+ video_name=video_name,
291
+ frame_name=frame_names[out_frame_idx],
292
+ per_obj_output_mask=per_obj_output_mask,
293
+ height=height,
294
+ width=width,
295
+ per_obj_png_file=per_obj_png_file,
296
+ )
297
+
298
+ @torch.inference_mode()
299
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
300
+ def vos_separate_inference_per_object(
301
+ predictor,
302
+ base_video_dir,
303
+ input_mask_dir,
304
+ output_mask_dir,
305
+ video_name,
306
+ score_thresh=0.0,
307
+ use_all_masks=False,
308
+ per_obj_png_file=False,
309
+ ):
310
+ """
311
+ Run inference on a single video with the given predictor.
312
+
313
+ Unlike `vos_inference`, this function run inference separately for each object
314
+ in a video, which could be applied to datasets like LVOS or YouTube-VOS that
315
+ don't have all objects to track appearing in the first frame (i.e. some objects
316
+ might appear only later in the video).
317
+ """
318
+ # load the video frames and initialize the inference state on this video
319
+ video_dir = os.path.join(base_video_dir, video_name)
320
+ frame_names = [
321
+ os.path.splitext(p)[0]
322
+ for p in os.listdir(video_dir)
323
+ if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
324
+ ]
325
+ frame_names = list(sorted(frame_names))
326
+ inference_state = predictor.init_state(
327
+ video_path=video_dir, async_loading_frames=False
328
+ )
329
+ height = inference_state["video_height"]
330
+ width = inference_state["video_width"]
331
+ input_palette = None
332
+
333
+ # collect all the object ids and their input masks
334
+ inputs_per_object = defaultdict(dict)
335
+ for idx, name in enumerate(frame_names):
336
+ if per_obj_png_file or os.path.exists(
337
+ os.path.join(input_mask_dir, video_name, f"{name}.png")
338
+ ):
339
+ per_obj_input_mask, input_palette = load_masks_from_dir(
340
+ input_mask_dir=input_mask_dir,
341
+ video_name=video_name,
342
+ frame_name=frame_names[idx],
343
+ per_obj_png_file=per_obj_png_file,
344
+ allow_missing=True,
345
+ )
346
+ for object_id, object_mask in per_obj_input_mask.items():
347
+ # skip empty masks
348
+ if not np.any(object_mask):
349
+ continue
350
+ # if `use_all_masks=False`, we only use the first mask for each object
351
+ if len(inputs_per_object[object_id]) > 0 and not use_all_masks:
352
+ continue
353
+ print(f"adding mask from frame {idx} as input for {object_id=}")
354
+ inputs_per_object[object_id][idx] = object_mask
355
+
356
+
357
+ # run inference separately for each object in the video
358
+ object_ids = sorted(inputs_per_object)
359
+ output_scores_per_object = defaultdict(dict)
360
+ for object_id in object_ids:
361
+ # add those input masks to SAM 2 inference state before propagation
362
+ input_frame_inds = sorted(inputs_per_object[object_id])
363
+ predictor.reset_state(inference_state)
364
+ for input_frame_idx in input_frame_inds:
365
+ predictor.add_new_mask(
366
+ inference_state=inference_state,
367
+ frame_idx=input_frame_idx,
368
+ obj_id=object_id,
369
+ mask=inputs_per_object[object_id][input_frame_idx],
370
+ )
371
+
372
+ # run propagation throughout the video and collect the results in a dict
373
+ for out_frame_idx, _, out_mask_logits in predictor.propagate_in_video(
374
+ inference_state,
375
+ start_frame_idx=min(input_frame_inds),
376
+ reverse=False,
377
+ ):
378
+ obj_scores = out_mask_logits.cpu().numpy()
379
+ output_scores_per_object[object_id][out_frame_idx] = obj_scores
380
+
381
+ # post-processing: consolidate the per-object scores into per-frame masks
382
+ os.makedirs(os.path.join(output_mask_dir, video_name), exist_ok=True)
383
+ output_palette = input_palette or DAVIS_PALETTE
384
+
385
+ video_segments = {} # video_segments contains the per-frame segmentation results
386
+ for frame_idx in range(len(frame_names)):
387
+ scores = torch.full(
388
+ size=(len(object_ids), 1, height, width),
389
+ fill_value=-1024.0,
390
+ dtype=torch.float32,
391
+ )
392
+ for i, object_id in enumerate(object_ids):
393
+ if frame_idx in output_scores_per_object[object_id]:
394
+ scores[i] = torch.from_numpy(
395
+ output_scores_per_object[object_id][frame_idx]
396
+ )
397
+
398
+ if not per_obj_png_file:
399
+ scores = predictor._apply_non_overlapping_constraints(scores)
400
+ per_obj_output_mask = {
401
+ object_id: (scores[i] > score_thresh).cpu().numpy()
402
+ for i, object_id in enumerate(object_ids)
403
+ }
404
+ video_segments[frame_idx] = per_obj_output_mask
405
+
406
+ # write the output masks as palette PNG files to output_mask_dir
407
+ for frame_idx, per_obj_output_mask in video_segments.items():
408
+ save_palette_masks_to_dir(
409
+ output_mask_dir=output_mask_dir,
410
+ video_name=video_name,
411
+ frame_name=frame_names[frame_idx],
412
+ per_obj_output_mask=per_obj_output_mask,
413
+ height=height,
414
+ width=width,
415
+ per_obj_png_file=per_obj_png_file,
416
+ output_palette=output_palette,
417
+ )
418
+
419
+
420
+ def main():
421
+ parser = argparse.ArgumentParser()
422
+ parser.add_argument(
423
+ "--sam2_cfg",
424
+ type=str,
425
+ default="configs/sam2.1_hiera_t512.yaml",
426
+ help="MedSAM2 model configuration file",
427
+ )
428
+ parser.add_argument(
429
+ "--sam2_checkpoint",
430
+ type=str,
431
+ default="./checkpoints/MedSAM2_latest.pt",
432
+ help="path to the MedSAM2 model checkpoint",
433
+ )
434
+ parser.add_argument(
435
+ "-i",
436
+ "--base_video_dir",
437
+ type=str,
438
+ required=True,
439
+ help="directory containing videos (as JPEG files) to run inference on",
440
+ )
441
+ parser.add_argument(
442
+ "-m",
443
+ "--input_mask_dir",
444
+ type=str,
445
+ required=True,
446
+ help="directory containing input masks (as PNG files) of each video",
447
+ )
448
+ parser.add_argument(
449
+ "--video_list_file",
450
+ type=str,
451
+ default=None,
452
+ help="text file containing the list of video names to run inference on",
453
+ )
454
+ parser.add_argument(
455
+ "-o",
456
+ "--output_mask_dir",
457
+ type=str,
458
+ required=True,
459
+ help="directory to save the output masks (as PNG files)",
460
+ )
461
+ parser.add_argument(
462
+ "--score_thresh",
463
+ type=float,
464
+ default=0.0,
465
+ help="threshold for the output mask logits (default: 0.0)",
466
+ )
467
+ parser.add_argument(
468
+ "--use_all_masks",
469
+ action="store_true",
470
+ help="whether to use all available PNG files in input_mask_dir "
471
+ "(default without this flag: just the first PNG file as input to the SAM 2 model; "
472
+ "usually we don't need this flag, since semi-supervised VOS evaluation usually takes input from the first frame only)",
473
+ )
474
+ parser.add_argument(
475
+ "--per_obj_png_file",
476
+ action="store_true",
477
+ help="whether use separate per-object PNG files for input and output masks "
478
+ "(default without this flag: all object masks are packed into a single PNG file on each frame following DAVIS format; "
479
+ "note that the SA-V dataset stores each object mask as an individual PNG file and requires this flag)",
480
+ )
481
+ parser.add_argument(
482
+ "--save_palette_png",
483
+ action="store_true",
484
+ help="whether to save palette PNG files for output masks "
485
+ "(default without this flag: all object masks are saved as grayscale PNG files (np.uint8) without palette)",
486
+ )
487
+ parser.add_argument(
488
+ "--apply_postprocessing",
489
+ action="store_true",
490
+ help="whether to apply postprocessing (e.g. hole-filling) to the output masks "
491
+ "(we don't apply such post-processing in the SAM 2 model evaluation)",
492
+ )
493
+ parser.add_argument(
494
+ "--track_object_appearing_later_in_video",
495
+ action="store_true",
496
+ help="whether to track objects that appear later in the video (i.e. not on the first frame; "
497
+ "some VOS datasets like LVOS or YouTube-VOS don't have all objects appearing in the first frame)",
498
+ )
499
+ parser.add_argument(
500
+ "--use_vos_optimized_video_predictor",
501
+ action="store_true",
502
+ help="whether to use vos optimized video predictor with all modules compiled",
503
+ )
504
+ args = parser.parse_args()
505
+
506
+ # if we use per-object PNG files, they could possibly overlap in inputs and outputs
507
+ hydra_overrides_extra = [
508
+ "++model.non_overlap_masks=" + ("false" if args.per_obj_png_file else "true")
509
+ ]
510
+ predictor = build_sam2_video_predictor(
511
+ config_file=args.sam2_cfg,
512
+ ckpt_path=args.sam2_checkpoint,
513
+ apply_postprocessing=args.apply_postprocessing,
514
+ hydra_overrides_extra=hydra_overrides_extra,
515
+ vos_optimized=args.use_vos_optimized_video_predictor,
516
+ )
517
+
518
+ if args.use_all_masks:
519
+ print("using all available masks in input_mask_dir as input to the MedSAM2 model")
520
+ else:
521
+ print(
522
+ "using only the first frame's mask in input_mask_dir as input to the MedSAM2 model"
523
+ )
524
+ # if a video list file is provided, read the video names from the file
525
+ # (otherwise, we use all subdirectories in base_video_dir)
526
+ if args.video_list_file is not None:
527
+ with open(args.video_list_file, "r") as f:
528
+ video_names = [v.strip() for v in f.readlines()]
529
+ else:
530
+ video_names = [
531
+ p
532
+ for p in os.listdir(args.base_video_dir)
533
+ if os.path.isdir(os.path.join(args.base_video_dir, p))
534
+ ]
535
+ print(f"running inference on {len(video_names)} videos:\n{video_names}")
536
+
537
+ for n_video, video_name in enumerate(video_names):
538
+ print(f"\n{n_video + 1}/{len(video_names)} - running on {video_name}")
539
+ if not args.track_object_appearing_later_in_video:
540
+ vos_inference(
541
+ predictor=predictor,
542
+ base_video_dir=args.base_video_dir,
543
+ input_mask_dir=args.input_mask_dir,
544
+ output_mask_dir=args.output_mask_dir,
545
+ video_name=video_name,
546
+ score_thresh=args.score_thresh,
547
+ use_all_masks=args.use_all_masks,
548
+ per_obj_png_file=args.per_obj_png_file,
549
+ save_palette_png=args.save_palette_png,
550
+ )
551
+ else:
552
+ vos_separate_inference_per_object(
553
+ predictor=predictor,
554
+ base_video_dir=args.base_video_dir,
555
+ input_mask_dir=args.input_mask_dir,
556
+ output_mask_dir=args.output_mask_dir,
557
+ video_name=video_name,
558
+ score_thresh=args.score_thresh,
559
+ use_all_masks=args.use_all_masks,
560
+ per_obj_png_file=args.per_obj_png_file,
561
+ )
562
+
563
+ print(
564
+ f"completed inference on {len(video_names)} videos -- "
565
+ f"output masks saved to {args.output_mask_dir}"
566
+ )
567
+
568
+
569
+ if __name__ == "__main__":
570
+ main()
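
`main()` is driven entirely by command-line flags, but the two entry points can also be called programmatically. A hedged sketch with placeholder directory names (nothing below ships with this repo):

```python
# Sketch: programmatic use of vos_inference with a MedSAM2 checkpoint.
from medsam2_infer_video import vos_inference
from sam2.build_sam import build_sam2_video_predictor

predictor = build_sam2_video_predictor(
    config_file="configs/sam2.1_hiera_t512.yaml",
    ckpt_path="./checkpoints/MedSAM2_latest.pt",
)
vos_inference(
    predictor=predictor,
    base_video_dir="data/JPEGImages",    # JPEG frames, one subfolder per video
    input_mask_dir="data/Annotations",   # first-frame PNG masks per video
    output_mask_dir="results/masks",
    video_name="case_0001",
)
```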
multi_node_train.sh ADDED
@@ -0,0 +1,48 @@
1
+ #!/bin/bash
2
+ #SBATCH -t 7-00:0:0
3
+ #SBATCH -J medsam2-tr-tiny
4
+ #SBATCH --mem=450G
5
+ #SBATCH -c 60
6
+ #SBATCH -N 3
7
+ #SBATCH --ntasks-per-node=1
8
+ #SBATCH --gres=gpu:4
9
+ #SBATCH -o out_mnodes_tiny.out
10
+
11
+ export PATH=/usr/local/cuda/bin:$PATH
12
+ timestamp=$(date +"%Y%m%d-%H%M")
13
+
14
+ # Set the master node address (first node in the allocation)
15
+ export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
16
+ # export MASTER_PORT=29500
17
+ export MASTER_PORT=$(python - <<EOF
18
+ import socket
19
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
20
+ sock.bind(('', 0)) # OS will allocate a free port
21
+ free_port = sock.getsockname()[1]
22
+ sock.close()
23
+ print(free_port)
24
+ EOF
25
+ )
26
+
27
+ # Print some information
28
+ echo "Master node: $MASTER_ADDR"
29
+ echo "Master port: $MASTER_PORT"
30
+ echo "Number of nodes: $SLURM_NNODES"
31
+ echo "GPUs per node: $SLURM_GPUS_ON_NODE"
32
+
33
+ config=configs/sam2.1_hiera_tiny_finetune512.yaml
34
+ output_path=./exp_log/mnode_tiny
35
+
36
+ # Function to run the training script
37
+ srun --exclusive python training/train.py \
38
+ -c $config \
39
+ --output-path $output_path \
40
+ --use-cluster 0 \
41
+ --num-gpus $SLURM_GPUS_ON_NODE \
42
+ --num-nodes $SLURM_NNODES \
43
+ --master-addr $MASTER_ADDR \
44
+ --main-port $MASTER_PORT
45
+
46
+ echo "training done"
47
+
48
+
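
The heredoc above picks a free TCP port on the master node; together with `MASTER_ADDR`, that port is what PyTorch's default `env://` rendezvous reads on every rank. A hedged sketch of the same logic in plain Python (the `init_process_group` call is shown for context only and assumes the usual `RANK`/`WORLD_SIZE` variables are set by the launcher):

```python
# Sketch: free-port selection (as in the heredoc) plus the env:// rendezvous it feeds.
import os
import socket

import torch.distributed as dist

def find_free_port() -> int:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("", 0))               # the OS allocates a free port
    port = sock.getsockname()[1]
    sock.close()
    return port

# On the launching host: export MASTER_ADDR / MASTER_PORT before spawning workers.
# os.environ["MASTER_ADDR"] = "<master hostname>"
# os.environ["MASTER_PORT"] = str(find_free_port())

# Each worker then joins the same rendezvous:
# dist.init_process_group(backend="nccl", init_method="env://",
#                         rank=int(os.environ["RANK"]),
#                         world_size=int(os.environ["WORLD_SIZE"]))
```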
notebooks/MedSAM2_Inference_Video.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/MedSAM2_inference_CT_Lesion.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,6 @@
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=61.0",
4
+ "torch>=2.5.1",
5
+ ]
6
+ build-backend = "setuptools.build_meta"
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ gradio==3.38.0
2
+ torch>=2.0
3
+ torchvision
4
+ numpy
5
+ SimpleITK
6
+ nibabel
7
+ opencv-python-headless
8
+ imageio
9
+ tqdm
10
+ matplotlib
11
+ einops
12
+ omegaconf
13
+ ffmpeg-python
14
+ moviepy
15
+ huggingface_hub
16
+ hydra-core
sam2/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from hydra import initialize_config_module
8
+ from hydra.core.global_hydra import GlobalHydra
9
+
10
+ if not GlobalHydra.instance().is_initialized():
11
+ initialize_config_module("sam2", version_base="1.2")
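
Registering `sam2` as a Hydra config module is what lets `sam2/build_sam.py` resolve package-relative config names such as `configs/sam2.1_hiera_t512.yaml` via `compose()`. A hedged sketch of that lookup in isolation:

```python
# Sketch: how the Hydra config module registered above is consumed.
import sam2  # importing the package runs initialize_config_module("sam2", ...)
from hydra import compose
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = compose(config_name="configs/sam2.1_hiera_t512.yaml")
OmegaConf.resolve(cfg)
model = instantiate(cfg.model, _recursive_=True)  # builds the SAM2 model graph
```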
sam2/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (431 Bytes). View file
 
sam2/__pycache__/build_sam.cpython-312.pyc ADDED
Binary file (5.16 kB). View file
 
sam2/__pycache__/sam2_image_predictor.cpython-312.pyc ADDED
Binary file (22.7 kB). View file
 
sam2/__pycache__/sam2_video_predictor_npz.cpython-312.pyc ADDED
Binary file (38 kB). View file
 
sam2/build_sam.py ADDED
@@ -0,0 +1,207 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import logging
8
+
9
+ import torch
10
+ from hydra import compose
11
+ from hydra.utils import instantiate
12
+ from omegaconf import OmegaConf
13
+
14
+ HF_MODEL_ID_TO_FILENAMES = {
15
+ "facebook/sam2-hiera-tiny": (
16
+ "configs/sam2/sam2_hiera_t.yaml",
17
+ "sam2_hiera_tiny.pt",
18
+ ),
19
+ "facebook/sam2-hiera-small": (
20
+ "configs/sam2/sam2_hiera_s.yaml",
21
+ "sam2_hiera_small.pt",
22
+ ),
23
+ "facebook/sam2-hiera-base-plus": (
24
+ "configs/sam2/sam2_hiera_b+.yaml",
25
+ "sam2_hiera_base_plus.pt",
26
+ ),
27
+ "facebook/sam2-hiera-large": (
28
+ "configs/sam2/sam2_hiera_l.yaml",
29
+ "sam2_hiera_large.pt",
30
+ ),
31
+ "facebook/sam2.1-hiera-tiny": (
32
+ "configs/sam2.1/sam2.1_hiera_t.yaml",
33
+ "sam2.1_hiera_tiny.pt",
34
+ ),
35
+ "facebook/sam2.1-hiera-small": (
36
+ "configs/sam2.1/sam2.1_hiera_s.yaml",
37
+ "sam2.1_hiera_small.pt",
38
+ ),
39
+ "facebook/sam2.1-hiera-base-plus": (
40
+ "configs/sam2.1/sam2.1_hiera_b+.yaml",
41
+ "sam2.1_hiera_base_plus.pt",
42
+ ),
43
+ "facebook/sam2.1-hiera-large": (
44
+ "configs/sam2.1/sam2.1_hiera_l.yaml",
45
+ "sam2.1_hiera_large.pt",
46
+ ),
47
+ }
48
+
49
+
50
+ def get_best_available_device():
51
+ """
52
+ Get the best available device in the order: CUDA, MPS, CPU
53
+ Returns: device string for torch.device
54
+ """
55
+ if torch.cuda.is_available():
56
+ return "cuda"
57
+ elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
58
+ return "mps"
59
+ else:
60
+ return "cpu"
61
+
62
+
63
+ def build_sam2(
64
+ config_file,
65
+ ckpt_path=None,
66
+ device=None,
67
+ mode="eval",
68
+ hydra_overrides_extra=[],
69
+ apply_postprocessing=True,
70
+ **kwargs,
71
+ ):
72
+ # Use the provided device or get the best available one
73
+ device = device or get_best_available_device()
74
+ logging.info(f"Using device: {device}")
75
+
76
+ if apply_postprocessing:
77
+ hydra_overrides_extra = hydra_overrides_extra.copy()
78
+ hydra_overrides_extra += [
79
+ # dynamically fall back to multi-mask if the single mask is not stable
80
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
81
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
82
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
83
+ ]
84
+ # Read config and init model
85
+ cfg = compose(config_name=config_file, overrides=hydra_overrides_extra)
86
+ OmegaConf.resolve(cfg)
87
+ model = instantiate(cfg.model, _recursive_=True)
88
+ _load_checkpoint(model, ckpt_path)
89
+ model = model.to(device)
90
+ if mode == "eval":
91
+ model.eval()
92
+ return model
93
+
94
+
95
+ def build_sam2_video_predictor(
96
+ config_file,
97
+ ckpt_path=None,
98
+ device=None,
99
+ mode="eval",
100
+ hydra_overrides_extra=[],
101
+ apply_postprocessing=True,
102
+ **kwargs,
103
+ ):
104
+ # Use the provided device or get the best available one
105
+ device = device or get_best_available_device()
106
+ logging.info(f"Using device: {device}")
107
+
108
+ hydra_overrides = [
109
+ "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",
110
+ ]
111
+ if apply_postprocessing:
112
+ hydra_overrides_extra = hydra_overrides_extra.copy()
113
+ hydra_overrides_extra += [
114
+ # dynamically fall back to multi-mask if the single mask is not stable
115
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
116
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
117
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
118
+ # the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
119
+ "++model.binarize_mask_from_pts_for_mem_enc=true",
120
+ # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
121
+ "++model.fill_hole_area=8",
122
+ ]
123
+ hydra_overrides.extend(hydra_overrides_extra)
124
+
125
+ # Read config and init model
126
+ cfg = compose(config_name=config_file, overrides=hydra_overrides)
127
+ OmegaConf.resolve(cfg)
128
+ model = instantiate(cfg.model, _recursive_=True)
129
+ _load_checkpoint(model, ckpt_path)
130
+ model = model.to(device)
131
+ if mode == "eval":
132
+ model.eval()
133
+ return model
134
+
135
+ def build_sam2_video_predictor_npz(
136
+ config_file,
137
+ ckpt_path=None,
138
+ device=None,
139
+ mode="eval",
140
+ hydra_overrides_extra=[],
141
+ apply_postprocessing=True,
142
+ **kwargs,
143
+ ):
144
+ # Use the provided device or get the best available one
145
+ device = device or get_best_available_device()
146
+ logging.info(f"Using device: {device}")
147
+
148
+ hydra_overrides = [
149
+ "++model._target_=sam2.sam2_video_predictor_npz.SAM2VideoPredictorNPZ",
150
+ ]
151
+ if apply_postprocessing:
152
+ hydra_overrides_extra = hydra_overrides_extra.copy()
153
+ hydra_overrides_extra += [
154
+ # dynamically fall back to multi-mask if the single mask is not stable
155
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
156
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
157
+ "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
158
+ # the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
159
+ "++model.binarize_mask_from_pts_for_mem_enc=true",
160
+ # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
161
+ "++model.fill_hole_area=8",
162
+ ]
163
+ hydra_overrides.extend(hydra_overrides_extra)
164
+
165
+ # Read config and init model
166
+ cfg = compose(config_name=config_file, overrides=hydra_overrides)
167
+ OmegaConf.resolve(cfg)
168
+ model = instantiate(cfg.model, _recursive_=True)
169
+ _load_checkpoint(model, ckpt_path)
170
+ model = model.to(device)
171
+ if mode == "eval":
172
+ model.eval()
173
+ return model
174
+
175
+
176
+
177
+ def _hf_download(model_id):
178
+ from huggingface_hub import hf_hub_download
179
+
180
+ config_name, checkpoint_name = HF_MODEL_ID_TO_FILENAMES[model_id]
181
+ ckpt_path = hf_hub_download(repo_id=model_id, filename=checkpoint_name)
182
+ return config_name, ckpt_path
183
+
184
+
185
+ def build_sam2_hf(model_id, **kwargs):
186
+ config_name, ckpt_path = _hf_download(model_id)
187
+ return build_sam2(config_file=config_name, ckpt_path=ckpt_path, **kwargs)
188
+
189
+
190
+ def build_sam2_video_predictor_hf(model_id, **kwargs):
191
+ config_name, ckpt_path = _hf_download(model_id)
192
+ return build_sam2_video_predictor(
193
+ config_file=config_name, ckpt_path=ckpt_path, **kwargs
194
+ )
195
+
196
+
197
+ def _load_checkpoint(model, ckpt_path):
198
+ if ckpt_path is not None:
199
+ sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
200
+ missing_keys, unexpected_keys = model.load_state_dict(sd)
201
+ if missing_keys:
202
+ logging.error(missing_keys)
203
+ raise RuntimeError()
204
+ if unexpected_keys:
205
+ logging.error(unexpected_keys)
206
+ raise RuntimeError()
207
+ logging.info("Loaded checkpoint successfully")
sam2/configs/sam2.1_hiera_t512.yaml ADDED
@@ -0,0 +1,121 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 96
12
+ num_heads: 1
13
+ stages: [1, 2, 7, 2]
14
+ global_att_blocks: [5, 7, 9]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ neck:
17
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
18
+ position_encoding:
19
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
20
+ num_pos_feats: 256
21
+ normalize: true
22
+ scale: null
23
+ temperature: 10000
24
+ d_model: 256
25
+ backbone_channel_list: [768, 384, 192, 96]
26
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27
+ fpn_interp_model: nearest
28
+
29
+ memory_attention:
30
+ _target_: sam2.modeling.memory_attention.MemoryAttention
31
+ d_model: 256
32
+ pos_enc_at_input: true
33
+ layer:
34
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
35
+ activation: relu
36
+ dim_feedforward: 2048
37
+ dropout: 0.1
38
+ pos_enc_at_attn: false
39
+ self_attention:
40
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
41
+ rope_theta: 10000.0
42
+ feat_sizes: [32, 32]
43
+ embedding_dim: 256
44
+ num_heads: 1
45
+ downsample_rate: 1
46
+ dropout: 0.1
47
+ d_model: 256
48
+ pos_enc_at_cross_attn_keys: true
49
+ pos_enc_at_cross_attn_queries: false
50
+ cross_attention:
51
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
52
+ rope_theta: 10000.0
53
+ feat_sizes: [32, 32]
54
+ rope_k_repeat: True
55
+ embedding_dim: 256
56
+ num_heads: 1
57
+ downsample_rate: 1
58
+ dropout: 0.1
59
+ kv_in_dim: 64
60
+ num_layers: 4
61
+
62
+ memory_encoder:
63
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
64
+ out_dim: 64
65
+ position_encoding:
66
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
67
+ num_pos_feats: 64
68
+ normalize: true
69
+ scale: null
70
+ temperature: 10000
71
+ mask_downsampler:
72
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
73
+ kernel_size: 3
74
+ stride: 2
75
+ padding: 1
76
+ fuser:
77
+ _target_: sam2.modeling.memory_encoder.Fuser
78
+ layer:
79
+ _target_: sam2.modeling.memory_encoder.CXBlock
80
+ dim: 256
81
+ kernel_size: 7
82
+ padding: 3
83
+ layer_scale_init_value: 1e-6
84
+ use_dwconv: True # depth-wise convs
85
+ num_layers: 2
86
+
87
+ num_maskmem: 7
88
+ image_size: 512
89
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90
+ # SAM decoder
91
+ sigmoid_scale_for_mem_enc: 20.0
92
+ sigmoid_bias_for_mem_enc: -10.0
93
+ use_mask_input_as_output_without_sam: true
94
+ # Memory
95
+ directly_add_no_mem_embed: true
96
+ no_obj_embed_spatial: true
97
+ # use high-resolution feature map in the SAM mask decoder
98
+ use_high_res_features_in_sam: true
99
+ # output 3 masks on the first click on initial conditioning frames
100
+ multimask_output_in_sam: true
101
+ # SAM heads
102
+ iou_prediction_use_sigmoid: True
103
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
104
+ use_obj_ptrs_in_encoder: true
105
+ add_tpos_enc_to_obj_ptrs: true
106
+ proj_tpos_enc_in_obj_ptrs: true
107
+ use_signed_tpos_enc_to_obj_ptrs: true
108
+ only_obj_ptrs_in_the_past_for_eval: true
109
+ # object occlusion prediction
110
+ pred_obj_scores: true
111
+ pred_obj_scores_mlp: true
112
+ fixed_no_obj_ptr: true
113
+ # multimask tracking settings
114
+ multimask_output_for_tracking: true
115
+ use_multimask_token_for_obj_ptr: true
116
+ multimask_min_pt_num: 0
117
+ multimask_max_pt_num: 1
118
+ use_mlp_for_obj_ptr_proj: true
119
+ # Compilation flag
120
+ # HieraT does not currently support compilation, should always be set to False
121
+ compile_image_encoder: False
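The config above wires up the full `SAM2Base` model at a 512 input resolution. A minimal sketch of how such a config is typically consumed, assuming the repo keeps the upstream `build_sam2` helper; the config name and checkpoint path below are placeholders and may need adjusting to how Hydra's search path is set up here:

```python
# Sketch only: config name and checkpoint path are assumptions, not the
# repo's documented entry point.
import torch
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = build_sam2(
    config_file="configs/sam2.1_hiera_t512.yaml",    # assumed config name
    ckpt_path="path/to/your_medsam2_checkpoint.pt",  # placeholder path
    device=device,
)
predictor = SAM2ImagePredictor(model)
```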
sam2/configs/sam2.1_hiera_tiny_finetune512.yaml ADDED
@@ -0,0 +1,389 @@
1
+ # @package _global_
2
+
3
+ scratch:
4
+ resolution: 512
5
+ train_video_batch_size: 8
6
+ num_train_workers: 15
7
+ num_frames: 8
8
+ max_num_objects: 5
9
+ base_lr: 5.0e-5
10
+ vision_lr: 3.0e-05
11
+ phases_per_epoch: 1
12
+ num_epochs: 75
13
+
14
+ dataset:
15
+ # PATHS to Dataset
16
+ folder: # PATH to Med NPZ folder
17
+ multiplier: 1
18
+
19
+ # Video transforms
20
+ vos:
21
+ train_transforms:
22
+ - _target_: training.dataset.transforms.ComposeAPI
23
+ transforms:
24
+ - _target_: training.dataset.transforms.RandomHorizontalFlip
25
+ consistent_transform: True
26
+ - _target_: training.dataset.transforms.RandomAffine
27
+ degrees: 25
28
+ shear: 20
29
+ image_interpolation: bilinear
30
+ consistent_transform: True
31
+ - _target_: training.dataset.transforms.RandomResizeAPI
32
+ sizes: ${scratch.resolution}
33
+ square: true
34
+ consistent_transform: True
35
+ - _target_: training.dataset.transforms.ColorJitter
36
+ consistent_transform: True
37
+ brightness: 0.1
38
+ contrast: 0.03
39
+ saturation: 0.03
40
+ hue: null
41
+ - _target_: training.dataset.transforms.RandomGrayscale
42
+ p: 0.05
43
+ consistent_transform: True
44
+ - _target_: training.dataset.transforms.ColorJitter
45
+ consistent_transform: False
46
+ brightness: 0.1
47
+ contrast: 0.05
48
+ saturation: 0.05
49
+ hue: null
50
+ - _target_: training.dataset.transforms.ToTensorAPI
51
+ - _target_: training.dataset.transforms.NormalizeAPI
52
+ mean: [0.485, 0.456, 0.406]
53
+ std: [0.229, 0.224, 0.225]
54
+
55
+
56
+ trainer:
57
+ _target_: training.trainer.Trainer
58
+ mode: train_only
59
+ max_epochs: ${times:${scratch.num_epochs},${scratch.phases_per_epoch}}
60
+ accelerator: cuda
61
+ seed_value: 123
62
+
63
+ model:
64
+ _target_: training.model.sam2.SAM2Train
65
+ image_encoder:
66
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
67
+ scalp: 1
68
+ trunk:
69
+ _target_: sam2.modeling.backbones.hieradet.Hiera
70
+ embed_dim: 96
71
+ num_heads: 1
72
+ stages: [1, 2, 7, 2]
73
+ global_att_blocks: [5, 7, 9]
74
+ window_pos_embed_bkg_spatial_size: [7, 7]
75
+ neck:
76
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
77
+ position_encoding:
78
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
79
+ num_pos_feats: 256
80
+ normalize: true
81
+ scale: null
82
+ temperature: 10000
83
+ d_model: 256
84
+ backbone_channel_list: [768, 384, 192, 96]
85
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
86
+ fpn_interp_model: nearest
87
+
88
+ memory_attention:
89
+ _target_: sam2.modeling.memory_attention.MemoryAttention
90
+ d_model: 256
91
+ pos_enc_at_input: true
92
+ layer:
93
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
94
+ activation: relu
95
+ dim_feedforward: 2048
96
+ dropout: 0.1
97
+ pos_enc_at_attn: false
98
+ self_attention:
99
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
100
+ rope_theta: 10000.0
101
+ feat_sizes: [32, 32]
102
+ embedding_dim: 256
103
+ num_heads: 1
104
+ downsample_rate: 1
105
+ dropout: 0.1
106
+ d_model: 256
107
+ pos_enc_at_cross_attn_keys: true
108
+ pos_enc_at_cross_attn_queries: false
109
+ cross_attention:
110
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
111
+ rope_theta: 10000.0
112
+ feat_sizes: [32, 32]
113
+ rope_k_repeat: True
114
+ embedding_dim: 256
115
+ num_heads: 1
116
+ downsample_rate: 1
117
+ dropout: 0.1
118
+ kv_in_dim: 64
119
+ num_layers: 4
120
+
121
+ memory_encoder:
122
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
123
+ out_dim: 64
124
+ position_encoding:
125
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
126
+ num_pos_feats: 64
127
+ normalize: true
128
+ scale: null
129
+ temperature: 10000
130
+ mask_downsampler:
131
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
132
+ kernel_size: 3
133
+ stride: 2
134
+ padding: 1
135
+ fuser:
136
+ _target_: sam2.modeling.memory_encoder.Fuser
137
+ layer:
138
+ _target_: sam2.modeling.memory_encoder.CXBlock
139
+ dim: 256
140
+ kernel_size: 7
141
+ padding: 3
142
+ layer_scale_init_value: 1e-6
143
+ use_dwconv: True # depth-wise convs
144
+ num_layers: 2
145
+
146
+ num_maskmem: 7
147
+ image_size: ${scratch.resolution}
148
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
149
+ # SAM decoder
150
+ sigmoid_scale_for_mem_enc: 20.0
151
+ sigmoid_bias_for_mem_enc: -10.0
152
+ use_mask_input_as_output_without_sam: true
153
+ # Memory
154
+ directly_add_no_mem_embed: true
155
+ no_obj_embed_spatial: true
156
+ # use high-resolution feature map in the SAM mask decoder
157
+ use_high_res_features_in_sam: true
158
+ # output 3 masks on the first click on initial conditioning frames
159
+ multimask_output_in_sam: true
160
+ # SAM heads
161
+ iou_prediction_use_sigmoid: True
162
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
163
+ use_obj_ptrs_in_encoder: true
164
+ add_tpos_enc_to_obj_ptrs: true
165
+ proj_tpos_enc_in_obj_ptrs: true
166
+ use_signed_tpos_enc_to_obj_ptrs: true
167
+ only_obj_ptrs_in_the_past_for_eval: true
168
+ # object occlusion prediction
169
+ pred_obj_scores: true
170
+ pred_obj_scores_mlp: true
171
+ fixed_no_obj_ptr: true
172
+ # multimask tracking settings
173
+ multimask_output_for_tracking: true
174
+ use_multimask_token_for_obj_ptr: true
175
+ multimask_min_pt_num: 0
176
+ multimask_max_pt_num: 1
177
+ use_mlp_for_obj_ptr_proj: true
178
+ # Compilation flag
179
+ # compile_image_encoder: False
180
+
181
+ ####### Training specific params #######
182
+ # box/point input and corrections
183
+ prob_to_use_pt_input_for_train: 0.5
184
+ prob_to_use_pt_input_for_eval: 0.0
185
+ prob_to_use_box_input_for_train: 1.0
186
+ prob_to_use_box_input_for_eval: 0.0
187
+ prob_to_sample_from_gt_for_train: 0.1 # with a small prob, sampling correction points from GT mask instead of prediction errors
188
+ num_frames_to_correct_for_train: 2 # iteratively sample on random 1~2 frames (always include the first frame)
189
+ num_frames_to_correct_for_eval: 1 # only iteratively sample on first frame
190
+ rand_frames_to_correct_for_train: True # random #init-cond-frame ~ 2
191
+ add_all_frames_to_correct_as_cond: True # when a frame receives a correction click, it becomes a conditioning frame (even if it's not initially a conditioning frame)
192
+ # maximum 2 initial conditioning frames
193
+ num_init_cond_frames_for_train: 2
194
+ rand_init_cond_frames_for_train: True # random 1~2
195
+ num_correction_pt_per_frame: 7
196
+ use_act_ckpt_iterative_pt_sampling: false
197
+
198
+
199
+
200
+ num_init_cond_frames_for_eval: 1 # only mask on the first frame
201
+ forward_backbone_per_frame_for_eval: True
202
+
203
+
204
+ data:
205
+ train:
206
+ _target_: training.dataset.sam2_datasets.TorchTrainMixedDataset
207
+ phases_per_epoch: ${scratch.phases_per_epoch}
208
+ batch_sizes:
209
+ - ${scratch.train_video_batch_size}
210
+ datasets:
211
+ - _target_: training.dataset.utils.RepeatFactorWrapper
212
+ dataset:
213
+ _target_: training.dataset.utils.ConcatDataset
214
+ datasets:
215
+ # CT
216
+ - _target_: training.dataset.vos_dataset.VOSDataset
217
+ transforms: ${vos.train_transforms}
218
+ training: true
219
+ video_dataset:
220
+ _target_: training.dataset.vos_raw_dataset.NPZRawDataset
221
+ folder: CVPR25/3D_train_npz_random_10percent_16G/CT
222
+ sampler:
223
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
224
+ num_frames: ${scratch.num_frames}
225
+ max_num_objects: ${scratch.max_num_objects}
226
+ multiplier: 1
227
+ # MR
228
+ - _target_: training.dataset.vos_dataset.VOSDataset
229
+ transforms: ${vos.train_transforms}
230
+ training: true
231
+ video_dataset:
232
+ _target_: training.dataset.vos_raw_dataset.NPZRawDataset
233
+ folder: CVPR25/3D_train_npz_random_10percent_16G/MR
234
+ sampler:
235
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
236
+ num_frames: ${scratch.num_frames}
237
+ max_num_objects: ${scratch.max_num_objects}
238
+ multiplier: 1
239
+ # PET
240
+ - _target_: training.dataset.vos_dataset.VOSDataset
241
+ transforms: ${vos.train_transforms}
242
+ training: true
243
+ video_dataset:
244
+ _target_: training.dataset.vos_raw_dataset.NPZRawDataset
245
+ folder: CVPR25/3D_train_npz_random_10percent_16G/PET
246
+ sampler:
247
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
248
+ num_frames: ${scratch.num_frames}
249
+ max_num_objects: ${scratch.max_num_objects}
250
+ multiplier: 10
251
+ # Ultrasound 3D
252
+ - _target_: training.dataset.vos_dataset.VOSDataset
253
+ transforms: ${vos.train_transforms}
254
+ training: true
255
+ video_dataset:
256
+ _target_: training.dataset.vos_raw_dataset.NPZRawDataset
257
+ folder: CVPR25/3D_train_npz_random_10percent_16G/US3D
258
+ sampler:
259
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
260
+ num_frames: ${scratch.num_frames}
261
+ max_num_objects: ${scratch.max_num_objects}
262
+ multiplier: 1
263
+ # Microscopy 3D
264
+ - _target_: training.dataset.vos_dataset.VOSDataset
265
+ transforms: ${vos.train_transforms}
266
+ training: true
267
+ video_dataset:
268
+ _target_: training.dataset.vos_raw_dataset.NPZRawDataset
269
+ folder: CVPR25/3D_train_npz_random_10percent_16G/Microscopy
270
+ sampler:
271
+ _target_: training.dataset.vos_sampler.RandomUniformSampler
272
+ num_frames: ${scratch.num_frames}
273
+ max_num_objects: ${scratch.max_num_objects}
274
+ multiplier: 1
275
+
276
+ shuffle: True
277
+ num_workers: ${scratch.num_train_workers}
278
+ pin_memory: True
279
+ drop_last: True
280
+ collate_fn:
281
+ _target_: training.utils.data_utils.collate_fn
282
+ _partial_: true
283
+ dict_key: all
284
+
285
+ optim:
286
+ amp:
287
+ enabled: True
288
+ amp_dtype: bfloat16
289
+
290
+ optimizer:
291
+ _target_: torch.optim.AdamW
292
+
293
+ gradient_clip:
294
+ _target_: training.optimizer.GradientClipper
295
+ max_norm: 0.1
296
+ norm_type: 2
297
+
298
+ param_group_modifiers:
299
+ - _target_: training.optimizer.layer_decay_param_modifier
300
+ _partial_: True
301
+ layer_decay_value: 0.9
302
+ apply_to: 'image_encoder.trunk'
303
+ overrides:
304
+ - pattern: '*pos_embed*'
305
+ value: 1.0
306
+
307
+ options:
308
+ lr:
309
+ - scheduler:
310
+ _target_: fvcore.common.param_scheduler.CosineParamScheduler
311
+ start_value: ${scratch.base_lr}
312
+ end_value: ${divide:${scratch.base_lr},10}
313
+ - scheduler:
314
+ _target_: fvcore.common.param_scheduler.CosineParamScheduler
315
+ start_value: ${scratch.vision_lr}
316
+ end_value: ${divide:${scratch.vision_lr},10}
317
+ param_names:
318
+ - 'image_encoder.*'
319
+ weight_decay:
320
+ - scheduler:
321
+ _target_: fvcore.common.param_scheduler.ConstantParamScheduler
322
+ value: 0.1
323
+ - scheduler:
324
+ _target_: fvcore.common.param_scheduler.ConstantParamScheduler
325
+ value: 0.0
326
+ param_names:
327
+ - '*bias*'
328
+ module_cls_names: ['torch.nn.LayerNorm']
329
+
330
+ loss:
331
+ all:
332
+ _target_: training.loss_fns.MultiStepMultiMasksAndIous
333
+ weight_dict:
334
+ loss_mask: 20
335
+ loss_dice: 1
336
+ loss_iou: 1
337
+ loss_class: 1
338
+ supervise_all_iou: true
339
+ iou_use_l1_loss: true
340
+ pred_obj_scores: true
341
+ focal_gamma_obj_score: 0.0
342
+ focal_alpha_obj_score: -1.0
343
+
344
+ distributed:
345
+ backend: nccl # gloo or nccl
346
+ find_unused_parameters: True
347
+
348
+ logging:
349
+ tensorboard_writer:
350
+ _target_: training.utils.logger.make_tensorboard_logger
351
+ log_dir: ${launcher.experiment_log_dir}/tensorboard
352
+ flush_secs: 120
353
+ should_log: True
354
+ log_dir: ${launcher.experiment_log_dir}/logs
355
+ log_freq: 10
356
+
357
+ # initialize from a SAM 2 checkpoint
358
+ checkpoint:
359
+ save_dir: ${launcher.experiment_log_dir}/checkpoints
360
+ save_freq: 10 # if set to 0, only the last checkpoint is saved
361
+ model_weight_initializer:
362
+ _partial_: True
363
+ _target_: training.utils.checkpoint_utils.load_state_dict_into_model
364
+ strict: True
365
+ ignore_unexpected_keys: null
366
+ ignore_missing_keys: null
367
+
368
+ state_dict:
369
+ _target_: training.utils.checkpoint_utils.load_checkpoint_and_apply_kernels
370
+ checkpoint_path: checkpoints/sam2.1_hiera_tiny.pt # PATH to SAM 2.1 checkpoint
371
+ ckpt_state_dict_keys: ['model']
372
+
373
+ launcher:
374
+ num_nodes: 1
375
+ gpus_per_node: 4
376
+ experiment_log_dir: exp_log # Path to log directory, defaults to ./sam2_logs/${config_name}
377
+
378
+ # SLURM args if running on a cluster
379
+ submitit:
380
+ partition: gpu_bwanggroup
381
+ account: null
382
+ qos: null
383
+ cpus_per_task: 10
384
+ use_cluster: false
385
+ timeout_hour: 24
386
+ name: null
387
+ port_range: [10000, 65000]
388
+
389
+
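The finetuning config above relies on custom OmegaConf resolvers for the `${times:...}` and `${divide:...}` interpolations (used by `trainer.max_epochs` and the LR schedulers). A minimal sketch of how those interpolations resolve, assuming resolvers registered under the same names; the actual training entry point presumably performs this registration itself:

```python
# Sketch only: resolver names mirror the interpolation keys used in the
# config above; shown purely to illustrate the mechanics.
import operator
from functools import reduce

from omegaconf import OmegaConf

OmegaConf.register_new_resolver("times", lambda *xs: reduce(operator.mul, xs))
OmegaConf.register_new_resolver("divide", lambda a, b: a / b)

cfg = OmegaConf.load("sam2/configs/sam2.1_hiera_tiny_finetune512.yaml")
print(cfg.trainer.max_epochs)                       # times(75, 1) -> 75
print(cfg.optim.options.lr[0].scheduler.end_value)  # divide(5.0e-5, 10)
```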
sam2/csrc/connected_components.cu ADDED
@@ -0,0 +1,289 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+
4
+ // This source code is licensed under the license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ // adapted from https://github.com/zsef123/Connected_components_PyTorch
8
+ // with license found in the LICENSE_cctorch file in the root directory.
9
+ #include <ATen/cuda/CUDAContext.h>
10
+ #include <cuda.h>
11
+ #include <cuda_runtime.h>
12
+ #include <torch/extension.h>
13
+ #include <torch/script.h>
14
+ #include <vector>
15
+
16
+ // 2d
17
+ #define BLOCK_ROWS 16
18
+ #define BLOCK_COLS 16
19
+
20
+ namespace cc2d {
21
+
22
+ template <typename T>
23
+ __device__ __forceinline__ unsigned char hasBit(T bitmap, unsigned char pos) {
24
+ return (bitmap >> pos) & 1;
25
+ }
26
+
27
+ __device__ int32_t find(const int32_t* s_buf, int32_t n) {
28
+ while (s_buf[n] != n)
29
+ n = s_buf[n];
30
+ return n;
31
+ }
32
+
33
+ __device__ int32_t find_n_compress(int32_t* s_buf, int32_t n) {
34
+ const int32_t id = n;
35
+ while (s_buf[n] != n) {
36
+ n = s_buf[n];
37
+ s_buf[id] = n;
38
+ }
39
+ return n;
40
+ }
41
+
42
+ __device__ void union_(int32_t* s_buf, int32_t a, int32_t b) {
43
+ bool done;
44
+ do {
45
+ a = find(s_buf, a);
46
+ b = find(s_buf, b);
47
+
48
+ if (a < b) {
49
+ int32_t old = atomicMin(s_buf + b, a);
50
+ done = (old == b);
51
+ b = old;
52
+ } else if (b < a) {
53
+ int32_t old = atomicMin(s_buf + a, b);
54
+ done = (old == a);
55
+ a = old;
56
+ } else
57
+ done = true;
58
+
59
+ } while (!done);
60
+ }
61
+
62
+ __global__ void
63
+ init_labeling(int32_t* label, const uint32_t W, const uint32_t H) {
64
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
65
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
66
+ const uint32_t idx = row * W + col;
67
+
68
+ if (row < H && col < W)
69
+ label[idx] = idx;
70
+ }
71
+
72
+ __global__ void
73
+ merge(uint8_t* img, int32_t* label, const uint32_t W, const uint32_t H) {
74
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
75
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
76
+ const uint32_t idx = row * W + col;
77
+
78
+ if (row >= H || col >= W)
79
+ return;
80
+
81
+ uint32_t P = 0;
82
+
83
+ if (img[idx])
84
+ P |= 0x777;
85
+ if (row + 1 < H && img[idx + W])
86
+ P |= 0x777 << 4;
87
+ if (col + 1 < W && img[idx + 1])
88
+ P |= 0x777 << 1;
89
+
90
+ if (col == 0)
91
+ P &= 0xEEEE;
92
+ if (col + 1 >= W)
93
+ P &= 0x3333;
94
+ else if (col + 2 >= W)
95
+ P &= 0x7777;
96
+
97
+ if (row == 0)
98
+ P &= 0xFFF0;
99
+ if (row + 1 >= H)
100
+ P &= 0xFF;
101
+
102
+ if (P > 0) {
103
+ // If bit 0 of P is set, also check the top-left neighbour pixel and
104
+ // merge with the top-left block when that pixel is foreground
105
+ if (hasBit(P, 0) && img[idx - W - 1]) {
106
+ union_(label, idx, idx - 2 * W - 2); // top left block
107
+ }
108
+
109
+ if ((hasBit(P, 1) && img[idx - W]) || (hasBit(P, 2) && img[idx - W + 1]))
110
+ union_(label, idx, idx - 2 * W); // top bottom block
111
+
112
+ if (hasBit(P, 3) && img[idx + 2 - W])
113
+ union_(label, idx, idx - 2 * W + 2); // top right block
114
+
115
+ if ((hasBit(P, 4) && img[idx - 1]) || (hasBit(P, 8) && img[idx + W - 1]))
116
+ union_(label, idx, idx - 2); // just left block
117
+ }
118
+ }
119
+
120
+ __global__ void compression(int32_t* label, const int32_t W, const int32_t H) {
121
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
122
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
123
+ const uint32_t idx = row * W + col;
124
+
125
+ if (row < H && col < W)
126
+ find_n_compress(label, idx);
127
+ }
128
+
129
+ __global__ void final_labeling(
130
+ const uint8_t* img,
131
+ int32_t* label,
132
+ const int32_t W,
133
+ const int32_t H) {
134
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
135
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
136
+ const uint32_t idx = row * W + col;
137
+
138
+ if (row >= H || col >= W)
139
+ return;
140
+
141
+ int32_t y = label[idx] + 1;
142
+
143
+ if (img[idx])
144
+ label[idx] = y;
145
+ else
146
+ label[idx] = 0;
147
+
148
+ if (col + 1 < W) {
149
+ if (img[idx + 1])
150
+ label[idx + 1] = y;
151
+ else
152
+ label[idx + 1] = 0;
153
+
154
+ if (row + 1 < H) {
155
+ if (img[idx + W + 1])
156
+ label[idx + W + 1] = y;
157
+ else
158
+ label[idx + W + 1] = 0;
159
+ }
160
+ }
161
+
162
+ if (row + 1 < H) {
163
+ if (img[idx + W])
164
+ label[idx + W] = y;
165
+ else
166
+ label[idx + W] = 0;
167
+ }
168
+ }
169
+
170
+ __global__ void init_counting(
171
+ const int32_t* label,
172
+ int32_t* count_init,
173
+ const int32_t W,
174
+ const int32_t H) {
175
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
176
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
177
+ const uint32_t idx = row * W + col;
178
+
179
+ if (row >= H || col >= W)
180
+ return;
181
+
182
+ int32_t y = label[idx];
183
+ if (y > 0) {
184
+ int32_t count_idx = y - 1;
185
+ atomicAdd(count_init + count_idx, 1);
186
+ }
187
+ }
188
+
189
+ __global__ void final_counting(
190
+ const int32_t* label,
191
+ const int32_t* count_init,
192
+ int32_t* count_final,
193
+ const int32_t W,
194
+ const int32_t H) {
195
+ const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
196
+ const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
197
+ const uint32_t idx = row * W + col;
198
+
199
+ if (row >= H || col >= W)
200
+ return;
201
+
202
+ int32_t y = label[idx];
203
+ if (y > 0) {
204
+ int32_t count_idx = y - 1;
205
+ count_final[idx] = count_init[count_idx];
206
+ } else {
207
+ count_final[idx] = 0;
208
+ }
209
+ }
210
+
211
+ } // namespace cc2d
212
+
213
+ std::vector<torch::Tensor> get_connected_componnets(
214
+ const torch::Tensor& inputs) {
215
+ AT_ASSERTM(inputs.is_cuda(), "inputs must be a CUDA tensor");
216
+ AT_ASSERTM(inputs.ndimension() == 4, "inputs must be [N, 1, H, W] shape");
217
+ AT_ASSERTM(
218
+ inputs.scalar_type() == torch::kUInt8, "inputs must be a uint8 type");
219
+
220
+ const uint32_t N = inputs.size(0);
221
+ const uint32_t C = inputs.size(1);
222
+ const uint32_t H = inputs.size(2);
223
+ const uint32_t W = inputs.size(3);
224
+
225
+ AT_ASSERTM(C == 1, "inputs must be [N, 1, H, W] shape");
226
+ AT_ASSERTM((H % 2) == 0, "height must be an even number");
227
+ AT_ASSERTM((W % 2) == 0, "width must be an even number");
228
+
229
+ // label must be uint32_t
230
+ auto label_options =
231
+ torch::TensorOptions().dtype(torch::kInt32).device(inputs.device());
232
+ torch::Tensor labels = torch::zeros({N, C, H, W}, label_options);
233
+ torch::Tensor counts_init = torch::zeros({N, C, H, W}, label_options);
234
+ torch::Tensor counts_final = torch::zeros({N, C, H, W}, label_options);
235
+
236
+ dim3 grid = dim3(
237
+ ((W + 1) / 2 + BLOCK_COLS - 1) / BLOCK_COLS,
238
+ ((H + 1) / 2 + BLOCK_ROWS - 1) / BLOCK_ROWS);
239
+ dim3 block = dim3(BLOCK_COLS, BLOCK_ROWS);
240
+ dim3 grid_count =
241
+ dim3((W + BLOCK_COLS) / BLOCK_COLS, (H + BLOCK_ROWS) / BLOCK_ROWS);
242
+ dim3 block_count = dim3(BLOCK_COLS, BLOCK_ROWS);
243
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
244
+
245
+ for (int n = 0; n < N; n++) {
246
+ uint32_t offset = n * H * W;
247
+
248
+ cc2d::init_labeling<<<grid, block, 0, stream>>>(
249
+ labels.data_ptr<int32_t>() + offset, W, H);
250
+ cc2d::merge<<<grid, block, 0, stream>>>(
251
+ inputs.data_ptr<uint8_t>() + offset,
252
+ labels.data_ptr<int32_t>() + offset,
253
+ W,
254
+ H);
255
+ cc2d::compression<<<grid, block, 0, stream>>>(
256
+ labels.data_ptr<int32_t>() + offset, W, H);
257
+ cc2d::final_labeling<<<grid, block, 0, stream>>>(
258
+ inputs.data_ptr<uint8_t>() + offset,
259
+ labels.data_ptr<int32_t>() + offset,
260
+ W,
261
+ H);
262
+
263
+ // get the counting of each pixel
264
+ cc2d::init_counting<<<grid_count, block_count, 0, stream>>>(
265
+ labels.data_ptr<int32_t>() + offset,
266
+ counts_init.data_ptr<int32_t>() + offset,
267
+ W,
268
+ H);
269
+ cc2d::final_counting<<<grid_count, block_count, 0, stream>>>(
270
+ labels.data_ptr<int32_t>() + offset,
271
+ counts_init.data_ptr<int32_t>() + offset,
272
+ counts_final.data_ptr<int32_t>() + offset,
273
+ W,
274
+ H);
275
+ }
276
+
277
+ // returned values are [labels, counts]
278
+ std::vector<torch::Tensor> outputs;
279
+ outputs.push_back(labels);
280
+ outputs.push_back(counts_final);
281
+ return outputs;
282
+ }
283
+
284
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
285
+ m.def(
286
+ "get_connected_componnets",
287
+ &get_connected_componnets,
288
+ "get_connected_componnets");
289
+ }
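A minimal sketch of calling this kernel from Python once the extension has been compiled, assuming it is exposed as `sam2._C` (the module name is an assumption and depends on how the build is configured):

```python
# Sketch only: `sam2._C` is an assumed extension name.
import torch
from sam2 import _C

# Batched binary masks: uint8, shape [N, 1, H, W]; H and W must be even.
masks = (torch.rand(2, 1, 64, 64, device="cuda") > 0.5).to(torch.uint8)

labels, counts = _C.get_connected_componnets(masks)  # upstream spelling
# `labels` gives every foreground pixel the id of its connected component
# (background stays 0); `counts` gives, per pixel, the size of that component.
print(labels.shape, counts.shape)  # torch.Size([2, 1, 64, 64]) for both
```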
sam2/modeling/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
sam2/modeling/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (146 Bytes)
 
sam2/modeling/__pycache__/memory_attention.cpython-312.pyc ADDED
Binary file (6.79 kB)
 
sam2/modeling/__pycache__/memory_encoder.cpython-312.pyc ADDED
Binary file (7.82 kB)
 
sam2/modeling/__pycache__/position_encoding.cpython-312.pyc ADDED
Binary file (14.5 kB)
 
sam2/modeling/__pycache__/sam2_base.cpython-312.pyc ADDED
Binary file (30.6 kB)
 
sam2/modeling/__pycache__/sam2_utils.cpython-312.pyc ADDED
Binary file (17.4 kB)
 
sam2/modeling/backbones/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
sam2/modeling/backbones/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (167 Bytes)
 
sam2/modeling/backbones/__pycache__/hieradet.cpython-312.pyc ADDED
Binary file (13.3 kB)
 
sam2/modeling/backbones/__pycache__/image_encoder.cpython-312.pyc ADDED
Binary file (5.47 kB)
 
sam2/modeling/backbones/__pycache__/utils.cpython-312.pyc ADDED
Binary file (4.31 kB)
 
sam2/modeling/backbones/hieradet.py ADDED
@@ -0,0 +1,317 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import logging
8
+ from functools import partial
9
+ from typing import List, Tuple, Union
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from iopath.common.file_io import g_pathmgr
15
+
16
+ from sam2.modeling.backbones.utils import (
17
+ PatchEmbed,
18
+ window_partition,
19
+ window_unpartition,
20
+ )
21
+
22
+ from sam2.modeling.sam2_utils import DropPath, MLP
23
+
24
+
25
+ def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor:
26
+ if pool is None:
27
+ return x
28
+ # (B, H, W, C) -> (B, C, H, W)
29
+ x = x.permute(0, 3, 1, 2)
30
+ x = pool(x)
31
+ # (B, C, H', W') -> (B, H', W', C)
32
+ x = x.permute(0, 2, 3, 1)
33
+ if norm:
34
+ x = norm(x)
35
+
36
+ return x
37
+
38
+
39
+ class MultiScaleAttention(nn.Module):
40
+ def __init__(
41
+ self,
42
+ dim: int,
43
+ dim_out: int,
44
+ num_heads: int,
45
+ q_pool: nn.Module = None,
46
+ ):
47
+ super().__init__()
48
+
49
+ self.dim = dim
50
+ self.dim_out = dim_out
51
+ self.num_heads = num_heads
52
+ self.q_pool = q_pool
53
+ self.qkv = nn.Linear(dim, dim_out * 3)
54
+ self.proj = nn.Linear(dim_out, dim_out)
55
+
56
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
57
+ B, H, W, _ = x.shape
58
+ # qkv with shape (B, H * W, 3, nHead, C)
59
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1)
60
+ # q, k, v with shape (B, H * W, nheads, C)
61
+ q, k, v = torch.unbind(qkv, 2)
62
+
63
+ # Q pooling (for downsample at stage changes)
64
+ if self.q_pool:
65
+ q = do_pool(q.reshape(B, H, W, -1), self.q_pool)
66
+ H, W = q.shape[1:3] # downsampled shape
67
+ q = q.reshape(B, H * W, self.num_heads, -1)
68
+
69
+ # Torch's SDPA expects [B, nheads, H*W, C] so we transpose
70
+ x = F.scaled_dot_product_attention(
71
+ q.transpose(1, 2),
72
+ k.transpose(1, 2),
73
+ v.transpose(1, 2),
74
+ )
75
+ # Transpose back
76
+ x = x.transpose(1, 2)
77
+ x = x.reshape(B, H, W, -1)
78
+
79
+ x = self.proj(x)
80
+
81
+ return x
82
+
83
+
84
+ class MultiScaleBlock(nn.Module):
85
+ def __init__(
86
+ self,
87
+ dim: int,
88
+ dim_out: int,
89
+ num_heads: int,
90
+ mlp_ratio: float = 4.0,
91
+ drop_path: float = 0.0,
92
+ norm_layer: Union[nn.Module, str] = "LayerNorm",
93
+ q_stride: Tuple[int, int] = None,
94
+ act_layer: nn.Module = nn.GELU,
95
+ window_size: int = 0,
96
+ ):
97
+ super().__init__()
98
+
99
+ if isinstance(norm_layer, str):
100
+ norm_layer = partial(getattr(nn, norm_layer), eps=1e-6)
101
+
102
+ self.dim = dim
103
+ self.dim_out = dim_out
104
+ self.norm1 = norm_layer(dim)
105
+
106
+ self.window_size = window_size
107
+
108
+ self.pool, self.q_stride = None, q_stride
109
+ if self.q_stride:
110
+ self.pool = nn.MaxPool2d(
111
+ kernel_size=q_stride, stride=q_stride, ceil_mode=False
112
+ )
113
+
114
+ self.attn = MultiScaleAttention(
115
+ dim,
116
+ dim_out,
117
+ num_heads=num_heads,
118
+ q_pool=self.pool,
119
+ )
120
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
121
+
122
+ self.norm2 = norm_layer(dim_out)
123
+ self.mlp = MLP(
124
+ dim_out,
125
+ int(dim_out * mlp_ratio),
126
+ dim_out,
127
+ num_layers=2,
128
+ activation=act_layer,
129
+ )
130
+
131
+ if dim != dim_out:
132
+ self.proj = nn.Linear(dim, dim_out)
133
+
134
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
135
+ shortcut = x # B, H, W, C
136
+ x = self.norm1(x)
137
+
138
+ # Skip connection
139
+ if self.dim != self.dim_out:
140
+ shortcut = do_pool(self.proj(x), self.pool)
141
+
142
+ # Window partition
143
+ window_size = self.window_size
144
+ if window_size > 0:
145
+ H, W = x.shape[1], x.shape[2]
146
+ x, pad_hw = window_partition(x, window_size)
147
+
148
+ # Window Attention + Q Pooling (if stage change)
149
+ x = self.attn(x)
150
+ if self.q_stride:
151
+ # Shapes have changed due to Q pooling
152
+ window_size = self.window_size // self.q_stride[0]
153
+ H, W = shortcut.shape[1:3]
154
+
155
+ pad_h = (window_size - H % window_size) % window_size
156
+ pad_w = (window_size - W % window_size) % window_size
157
+ pad_hw = (H + pad_h, W + pad_w)
158
+
159
+ # Reverse window partition
160
+ if self.window_size > 0:
161
+ x = window_unpartition(x, window_size, pad_hw, (H, W))
162
+
163
+ x = shortcut + self.drop_path(x)
164
+ # MLP
165
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
166
+ return x
167
+
168
+
169
+ class Hiera(nn.Module):
170
+ """
171
+ Reference: https://arxiv.org/abs/2306.00989
172
+ """
173
+
174
+ def __init__(
175
+ self,
176
+ embed_dim: int = 96, # initial embed dim
177
+ num_heads: int = 1, # initial number of heads
178
+ drop_path_rate: float = 0.0, # stochastic depth
179
+ q_pool: int = 3, # number of q_pool stages
180
+ q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages
181
+ stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
182
+ dim_mul: float = 2.0, # dim_mul factor at stage shift
183
+ head_mul: float = 2.0, # head_mul factor at stage shift
184
+ window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
185
+ # window size per stage, when not using global att.
186
+ window_spec: Tuple[int, ...] = (
187
+ 8,
188
+ 4,
189
+ 14,
190
+ 7,
191
+ ),
192
+ # global attn in these blocks
193
+ global_att_blocks: Tuple[int, ...] = (
194
+ 12,
195
+ 16,
196
+ 20,
197
+ ),
198
+ weights_path=None,
199
+ return_interm_layers=True, # return feats from every stage
200
+ ):
201
+ super().__init__()
202
+
203
+ assert len(stages) == len(window_spec)
204
+ self.window_spec = window_spec
205
+
206
+ depth = sum(stages)
207
+ self.q_stride = q_stride
208
+ self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
209
+ assert 0 <= q_pool <= len(self.stage_ends[:-1])
210
+ self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool]
211
+ self.return_interm_layers = return_interm_layers
212
+
213
+ self.patch_embed = PatchEmbed(
214
+ embed_dim=embed_dim,
215
+ )
216
+ # Which blocks have global att?
217
+ self.global_att_blocks = global_att_blocks
218
+
219
+ # Windowed positional embedding (https://arxiv.org/abs/2311.05613)
220
+ self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size
221
+ self.pos_embed = nn.Parameter(
222
+ torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size)
223
+ )
224
+ self.pos_embed_window = nn.Parameter(
225
+ torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0])
226
+ )
227
+
228
+ dpr = [
229
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
230
+ ] # stochastic depth decay rule
231
+
232
+ cur_stage = 1
233
+ self.blocks = nn.ModuleList()
234
+
235
+ for i in range(depth):
236
+ dim_out = embed_dim
237
+ # lags by a block, so first block of
238
+ # next stage uses an initial window size
239
+ # of previous stage and final window size of current stage
240
+ window_size = self.window_spec[cur_stage - 1]
241
+
242
+ if self.global_att_blocks is not None:
243
+ window_size = 0 if i in self.global_att_blocks else window_size
244
+
245
+ if i - 1 in self.stage_ends:
246
+ dim_out = int(embed_dim * dim_mul)
247
+ num_heads = int(num_heads * head_mul)
248
+ cur_stage += 1
249
+
250
+ block = MultiScaleBlock(
251
+ dim=embed_dim,
252
+ dim_out=dim_out,
253
+ num_heads=num_heads,
254
+ drop_path=dpr[i],
255
+ q_stride=self.q_stride if i in self.q_pool_blocks else None,
256
+ window_size=window_size,
257
+ )
258
+
259
+ embed_dim = dim_out
260
+ self.blocks.append(block)
261
+
262
+ self.channel_list = (
263
+ [self.blocks[i].dim_out for i in self.stage_ends[::-1]]
264
+ if return_interm_layers
265
+ else [self.blocks[-1].dim_out]
266
+ )
267
+
268
+ if weights_path is not None:
269
+ with g_pathmgr.open(weights_path, "rb") as f:
270
+ chkpt = torch.load(f, map_location="cpu")
271
+ logging.info("loading Hiera", self.load_state_dict(chkpt, strict=False))
272
+
273
+ def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
274
+ h, w = hw
275
+ window_embed = self.pos_embed_window
276
+ pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic")
277
+ pos_embed = pos_embed + window_embed.tile(
278
+ [x // y for x, y in zip(pos_embed.shape, window_embed.shape)]
279
+ )
280
+ pos_embed = pos_embed.permute(0, 2, 3, 1)
281
+ return pos_embed
282
+
283
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
284
+ x = self.patch_embed(x)
285
+ # x: (B, H, W, C)
286
+
287
+ # Add pos embed
288
+ x = x + self._get_pos_embed(x.shape[1:3])
289
+
290
+ outputs = []
291
+ for i, blk in enumerate(self.blocks):
292
+ x = blk(x)
293
+ if (i == self.stage_ends[-1]) or (
294
+ i in self.stage_ends and self.return_interm_layers
295
+ ):
296
+ feats = x.permute(0, 3, 1, 2)
297
+ outputs.append(feats)
298
+
299
+ return outputs
300
+
301
+ def get_layer_id(self, layer_name):
302
+ # https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
303
+ num_layers = self.get_num_layers()
304
+
305
+ if layer_name.find("rel_pos") != -1:
306
+ return num_layers + 1
307
+ elif layer_name.find("pos_embed") != -1:
308
+ return 0
309
+ elif layer_name.find("patch_embed") != -1:
310
+ return 0
311
+ elif layer_name.find("blocks") != -1:
312
+ return int(layer_name.split("blocks")[1].split(".")[1]) + 1
313
+ else:
314
+ return num_layers + 1
315
+
316
+ def get_num_layers(self) -> int:
317
+ return len(self.blocks)
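A minimal sketch instantiating the Hiera trunk with the tiny settings used in the configs above and inspecting the four multi-scale feature maps it returns for a 512x512 input:

```python
# Runs on CPU; mirrors the trunk settings from the configs above.
import torch
from sam2.modeling.backbones.hieradet import Hiera

trunk = Hiera(
    embed_dim=96,
    num_heads=1,
    stages=(1, 2, 7, 2),
    global_att_blocks=(5, 7, 9),
    window_pos_embed_bkg_spatial_size=(7, 7),
)

x = torch.randn(1, 3, 512, 512)  # image_size: 512 in the configs
for feat in trunk(x):
    print(tuple(feat.shape))
# (1, 96, 128, 128), (1, 192, 64, 64), (1, 384, 32, 32), (1, 768, 16, 16)
# i.e. strides 4/8/16/32, matching backbone_channel_list in reverse order.
```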
sam2/modeling/backbones/image_encoder.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import List, Optional
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ class ImageEncoder(nn.Module):
15
+ def __init__(
16
+ self,
17
+ trunk: nn.Module,
18
+ neck: nn.Module,
19
+ scalp: int = 0,
20
+ ):
21
+ super().__init__()
22
+ self.trunk = trunk
23
+ self.neck = neck
24
+ self.scalp = scalp
25
+ assert (
26
+ self.trunk.channel_list == self.neck.backbone_channel_list
27
+ ), f"Channel dims of trunk and neck do not match. Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}"
28
+
29
+ def forward(self, sample: torch.Tensor):
30
+ # Forward through backbone
31
+ features, pos = self.neck(self.trunk(sample))
32
+ if self.scalp > 0:
33
+ # Discard the lowest resolution features
34
+ features, pos = features[: -self.scalp], pos[: -self.scalp]
35
+
36
+ src = features[-1]
37
+ output = {
38
+ "vision_features": src,
39
+ "vision_pos_enc": pos,
40
+ "backbone_fpn": features,
41
+ }
42
+ return output
43
+
44
+
45
+ class FpnNeck(nn.Module):
46
+ """
47
+ A modified variant of Feature Pyramid Network (FPN) neck
48
+ (we remove output conv and also do bicubic interpolation similar to ViT
49
+ pos embed interpolation)
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ position_encoding: nn.Module,
55
+ d_model: int,
56
+ backbone_channel_list: List[int],
57
+ kernel_size: int = 1,
58
+ stride: int = 1,
59
+ padding: int = 0,
60
+ fpn_interp_model: str = "bilinear",
61
+ fuse_type: str = "sum",
62
+ fpn_top_down_levels: Optional[List[int]] = None,
63
+ ):
64
+ """Initialize the neck
65
+ :param trunk: the backbone
66
+ :param position_encoding: the positional encoding to use
67
+ :param d_model: the dimension of the model
68
+ :param neck_norm: the normalization to use
69
+ """
70
+ super().__init__()
71
+ self.position_encoding = position_encoding
72
+ self.convs = nn.ModuleList()
73
+ self.backbone_channel_list = backbone_channel_list
74
+ self.d_model = d_model
75
+ for dim in backbone_channel_list:
76
+ current = nn.Sequential()
77
+ current.add_module(
78
+ "conv",
79
+ nn.Conv2d(
80
+ in_channels=dim,
81
+ out_channels=d_model,
82
+ kernel_size=kernel_size,
83
+ stride=stride,
84
+ padding=padding,
85
+ ),
86
+ )
87
+
88
+ self.convs.append(current)
89
+ self.fpn_interp_model = fpn_interp_model
90
+ assert fuse_type in ["sum", "avg"]
91
+ self.fuse_type = fuse_type
92
+
93
+ # levels to have top-down features in its outputs
94
+ # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3
95
+ # have top-down propagation, while outputs of level 0 and level 1 have only
96
+ # lateral features from the same backbone level.
97
+ if fpn_top_down_levels is None:
98
+ # default is to have top-down features on all levels
99
+ fpn_top_down_levels = range(len(self.convs))
100
+ self.fpn_top_down_levels = list(fpn_top_down_levels)
101
+
102
+ def forward(self, xs: List[torch.Tensor]):
103
+
104
+ out = [None] * len(self.convs)
105
+ pos = [None] * len(self.convs)
106
+ assert len(xs) == len(self.convs)
107
+ # fpn forward pass
108
+ # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
109
+ prev_features = None
110
+ # forward in top-down order (from low to high resolution)
111
+ n = len(self.convs) - 1
112
+ for i in range(n, -1, -1):
113
+ x = xs[i]
114
+ lateral_features = self.convs[n - i](x)
115
+ if i in self.fpn_top_down_levels and prev_features is not None:
116
+ top_down_features = F.interpolate(
117
+ prev_features.to(dtype=torch.float32),
118
+ scale_factor=2.0,
119
+ mode=self.fpn_interp_model,
120
+ align_corners=(
121
+ None if self.fpn_interp_model == "nearest" else False
122
+ ),
123
+ antialias=False,
124
+ )
125
+ prev_features = lateral_features + top_down_features
126
+ if self.fuse_type == "avg":
127
+ prev_features /= 2
128
+ else:
129
+ prev_features = lateral_features
130
+ x_out = prev_features
131
+ out[i] = x_out
132
+ pos[i] = self.position_encoding(x_out).to(x_out.dtype)
133
+
134
+ return out, pos
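A minimal sketch of the `FpnNeck` as parameterized above, fed with dummy Hiera-tiny feature maps; levels 0 and 1 keep pure lateral features, while levels 2 and 3 receive top-down fusion:

```python
import torch
from sam2.modeling.backbones.image_encoder import FpnNeck
from sam2.modeling.position_encoding import PositionEmbeddingSine

neck = FpnNeck(
    position_encoding=PositionEmbeddingSine(num_pos_feats=256),
    d_model=256,
    backbone_channel_list=[768, 384, 192, 96],
    fpn_top_down_levels=[2, 3],
    fpn_interp_model="nearest",
)

xs = [
    torch.randn(1, 96, 128, 128),  # level 0, stride 4
    torch.randn(1, 192, 64, 64),   # level 1, stride 8
    torch.randn(1, 384, 32, 32),   # level 2, stride 16
    torch.randn(1, 768, 16, 16),   # level 3, stride 32
]
out, pos = neck(xs)
print([tuple(o.shape) for o in out])  # all projected to 256 channels
# In ImageEncoder, `scalp: 1` then discards the lowest-resolution level.
```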
sam2/modeling/backbones/utils.py ADDED
@@ -0,0 +1,95 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Some utilities for backbones, in particular for windowing"""
8
+
9
+ from typing import Tuple
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+
15
+
16
+ def window_partition(x, window_size):
17
+ """
18
+ Partition into non-overlapping windows with padding if needed.
19
+ Args:
20
+ x (tensor): input tokens with [B, H, W, C].
21
+ window_size (int): window size.
22
+ Returns:
23
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
24
+ (Hp, Wp): padded height and width before partition
25
+ """
26
+ B, H, W, C = x.shape
27
+
28
+ pad_h = (window_size - H % window_size) % window_size
29
+ pad_w = (window_size - W % window_size) % window_size
30
+ if pad_h > 0 or pad_w > 0:
31
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
32
+ Hp, Wp = H + pad_h, W + pad_w
33
+
34
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
35
+ windows = (
36
+ x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
37
+ )
38
+ return windows, (Hp, Wp)
39
+
40
+
41
+ def window_unpartition(windows, window_size, pad_hw, hw):
42
+ """
43
+ Window unpartition into original sequences and removing padding.
44
+ Args:
45
+ windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
46
+ window_size (int): window size.
47
+ pad_hw (Tuple): padded height and width (Hp, Wp).
48
+ hw (Tuple): original height and width (H, W) before padding.
49
+ Returns:
50
+ x: unpartitioned sequences with [B, H, W, C].
51
+ """
52
+ Hp, Wp = pad_hw
53
+ H, W = hw
54
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
55
+ x = windows.view(
56
+ B, Hp // window_size, Wp // window_size, window_size, window_size, -1
57
+ )
58
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
59
+
60
+ if Hp > H or Wp > W:
61
+ x = x[:, :H, :W, :].contiguous()
62
+ return x
63
+
64
+
65
+ class PatchEmbed(nn.Module):
66
+ """
67
+ Image to Patch Embedding.
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ kernel_size: Tuple[int, ...] = (7, 7),
73
+ stride: Tuple[int, ...] = (4, 4),
74
+ padding: Tuple[int, ...] = (3, 3),
75
+ in_chans: int = 3,
76
+ embed_dim: int = 768,
77
+ ):
78
+ """
79
+ Args:
80
+ kernel_size (Tuple): kernel size of the projection layer.
81
+ stride (Tuple): stride of the projection layer.
82
+ padding (Tuple): padding size of the projection layer.
83
+ in_chans (int): Number of input image channels.
84
+ embed_dim (int): Patch embedding dimension.
85
+ """
86
+ super().__init__()
87
+ self.proj = nn.Conv2d(
88
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
89
+ )
90
+
91
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
92
+ x = self.proj(x)
93
+ # B C H W -> B H W C
94
+ x = x.permute(0, 2, 3, 1)
95
+ return x
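A minimal sketch round-tripping a feature map through `window_partition` and `window_unpartition` to show the padding bookkeeping:

```python
import torch
from sam2.modeling.backbones.utils import window_partition, window_unpartition

x = torch.randn(2, 30, 30, 96)              # (B, H, W, C), not divisible by 8
windows, (Hp, Wp) = window_partition(x, 8)  # padded to 32x32 -> 16 windows each
print(windows.shape, (Hp, Wp))              # torch.Size([32, 8, 8, 96]) (32, 32)

y = window_unpartition(windows, 8, (Hp, Wp), (30, 30))
print(torch.equal(x, y))                    # True: padding is stripped again
```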
sam2/modeling/memory_attention.py ADDED
@@ -0,0 +1,169 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Optional
8
+
9
+ import torch
10
+ from torch import nn, Tensor
11
+
12
+ from sam2.modeling.sam.transformer import RoPEAttention
13
+
14
+ from sam2.modeling.sam2_utils import get_activation_fn, get_clones
15
+
16
+
17
+ class MemoryAttentionLayer(nn.Module):
18
+
19
+ def __init__(
20
+ self,
21
+ activation: str,
22
+ cross_attention: nn.Module,
23
+ d_model: int,
24
+ dim_feedforward: int,
25
+ dropout: float,
26
+ pos_enc_at_attn: bool,
27
+ pos_enc_at_cross_attn_keys: bool,
28
+ pos_enc_at_cross_attn_queries: bool,
29
+ self_attention: nn.Module,
30
+ ):
31
+ super().__init__()
32
+ self.d_model = d_model
33
+ self.dim_feedforward = dim_feedforward
34
+ self.dropout_value = dropout
35
+ self.self_attn = self_attention
36
+ self.cross_attn_image = cross_attention
37
+
38
+ # Implementation of Feedforward model
39
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
40
+ self.dropout = nn.Dropout(dropout)
41
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
42
+
43
+ self.norm1 = nn.LayerNorm(d_model)
44
+ self.norm2 = nn.LayerNorm(d_model)
45
+ self.norm3 = nn.LayerNorm(d_model)
46
+ self.dropout1 = nn.Dropout(dropout)
47
+ self.dropout2 = nn.Dropout(dropout)
48
+ self.dropout3 = nn.Dropout(dropout)
49
+
50
+ self.activation_str = activation
51
+ self.activation = get_activation_fn(activation)
52
+
53
+ # Where to add pos enc
54
+ self.pos_enc_at_attn = pos_enc_at_attn
55
+ self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
56
+ self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
57
+
58
+ def _forward_sa(self, tgt, query_pos):
59
+ # Self-Attention
60
+ tgt2 = self.norm1(tgt)
61
+ q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
62
+ tgt2 = self.self_attn(q, k, v=tgt2)
63
+ tgt = tgt + self.dropout1(tgt2)
64
+ return tgt
65
+
66
+ def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
67
+ kwds = {}
68
+ if num_k_exclude_rope > 0:
69
+ assert isinstance(self.cross_attn_image, RoPEAttention)
70
+ kwds = {"num_k_exclude_rope": num_k_exclude_rope}
71
+
72
+ # Cross-Attention
73
+ tgt2 = self.norm2(tgt)
74
+ tgt2 = self.cross_attn_image(
75
+ q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
76
+ k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
77
+ v=memory,
78
+ **kwds,
79
+ )
80
+ tgt = tgt + self.dropout2(tgt2)
81
+ return tgt
82
+
83
+ def forward(
84
+ self,
85
+ tgt,
86
+ memory,
87
+ pos: Optional[Tensor] = None,
88
+ query_pos: Optional[Tensor] = None,
89
+ num_k_exclude_rope: int = 0,
90
+ ) -> torch.Tensor:
91
+
92
+ # Self-Attn, Cross-Attn
93
+ tgt = self._forward_sa(tgt, query_pos)
94
+ tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
95
+ # MLP
96
+ tgt2 = self.norm3(tgt)
97
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
98
+ tgt = tgt + self.dropout3(tgt2)
99
+ return tgt
100
+
101
+
102
+ class MemoryAttention(nn.Module):
103
+ def __init__(
104
+ self,
105
+ d_model: int,
106
+ pos_enc_at_input: bool,
107
+ layer: nn.Module,
108
+ num_layers: int,
109
+ batch_first: bool = True, # Do layers expect batch first input?
110
+ ):
111
+ super().__init__()
112
+ self.d_model = d_model
113
+ self.layers = get_clones(layer, num_layers)
114
+ self.num_layers = num_layers
115
+ self.norm = nn.LayerNorm(d_model)
116
+ self.pos_enc_at_input = pos_enc_at_input
117
+ self.batch_first = batch_first
118
+
119
+ def forward(
120
+ self,
121
+ curr: torch.Tensor, # self-attention inputs
122
+ memory: torch.Tensor, # cross-attention inputs
123
+ curr_pos: Optional[Tensor] = None, # pos_enc for self-attention inputs
124
+ memory_pos: Optional[Tensor] = None, # pos_enc for cross-attention inputs
125
+ num_obj_ptr_tokens: int = 0, # number of object pointer *tokens*
126
+ ):
127
+ if isinstance(curr, list):
128
+ assert isinstance(curr_pos, list)
129
+ assert len(curr) == len(curr_pos) == 1
130
+ curr, curr_pos = (
131
+ curr[0],
132
+ curr_pos[0],
133
+ )
134
+
135
+ assert (
136
+ curr.shape[1] == memory.shape[1]
137
+ ), "Batch size must be the same for curr and memory"
138
+
139
+ output = curr
140
+ if self.pos_enc_at_input and curr_pos is not None:
141
+ output = output + 0.1 * curr_pos
142
+
143
+ if self.batch_first:
144
+ # Convert to batch first
145
+ output = output.transpose(0, 1)
146
+ curr_pos = curr_pos.transpose(0, 1)
147
+ memory = memory.transpose(0, 1)
148
+ memory_pos = memory_pos.transpose(0, 1)
149
+
150
+ for layer in self.layers:
151
+ kwds = {}
152
+ if isinstance(layer.cross_attn_image, RoPEAttention):
153
+ kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}
154
+
155
+ output = layer(
156
+ tgt=output,
157
+ memory=memory,
158
+ pos=memory_pos,
159
+ query_pos=curr_pos,
160
+ **kwds,
161
+ )
162
+ normed_output = self.norm(output)
163
+
164
+ if self.batch_first:
165
+ # Convert back to seq first
166
+ normed_output = normed_output.transpose(0, 1)
167
+ curr_pos = curr_pos.transpose(0, 1)
168
+
169
+ return normed_output
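A minimal sketch of the MemoryAttention stack's tensor layout (sequence-first in, sequence-first out), using a toy single-head attention module in place of `RoPEAttention`. The toy module, the 256-dim memory tokens, and all shapes are illustrative assumptions; the real configuration above uses 64-dim memory keys/values via `kv_in_dim` and RoPE over 32x32 feature grids:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

from sam2.modeling.memory_attention import MemoryAttention, MemoryAttentionLayer


class ToyAttention(nn.Module):
    """Single-head SDPA with the q/k/v call signature the layer expects."""

    def forward(self, q, k, v):
        return F.scaled_dot_product_attention(q, k, v)


layer = MemoryAttentionLayer(
    activation="relu",
    cross_attention=ToyAttention(),
    d_model=256,
    dim_feedforward=2048,
    dropout=0.1,
    pos_enc_at_attn=False,
    pos_enc_at_cross_attn_keys=True,
    pos_enc_at_cross_attn_queries=False,
    self_attention=ToyAttention(),
)
attn = MemoryAttention(d_model=256, pos_enc_at_input=True, layer=layer, num_layers=2)

B, L_cur, L_mem, C = 1, 32 * 32, 4 * 32 * 32, 256
curr = torch.randn(L_cur, B, C)        # current-frame tokens (seq-first)
curr_pos = torch.randn(L_cur, B, C)
memory = torch.randn(L_mem, B, C)      # e.g. four past frames of 32x32 tokens
memory_pos = torch.randn(L_mem, B, C)

out = attn(curr=curr, memory=memory, curr_pos=curr_pos, memory_pos=memory_pos)
print(out.shape)  # torch.Size([1024, 1, 256])
```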
sam2/modeling/memory_encoder.py ADDED
@@ -0,0 +1,181 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ from typing import Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ from sam2.modeling.sam2_utils import DropPath, get_clones, LayerNorm2d
15
+
16
+
17
+ class MaskDownSampler(nn.Module):
18
+ """
19
+ Progressively downsample a mask by total_stride, each time by stride.
20
+ Note that LayerNorm is applied per *token*, like in ViT.
21
+
22
+ With each downsample (by a factor stride**2), channel capacity increases by the same factor.
23
+ In the end, we linearly project to embed_dim channels.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ embed_dim=256,
29
+ kernel_size=4,
30
+ stride=4,
31
+ padding=0,
32
+ total_stride=16,
33
+ activation=nn.GELU,
34
+ ):
35
+ super().__init__()
36
+ num_layers = int(math.log2(total_stride) // math.log2(stride))
37
+ assert stride**num_layers == total_stride
38
+ self.encoder = nn.Sequential()
39
+ mask_in_chans, mask_out_chans = 1, 1
40
+ for _ in range(num_layers):
41
+ mask_out_chans = mask_in_chans * (stride**2)
42
+ self.encoder.append(
43
+ nn.Conv2d(
44
+ mask_in_chans,
45
+ mask_out_chans,
46
+ kernel_size=kernel_size,
47
+ stride=stride,
48
+ padding=padding,
49
+ )
50
+ )
51
+ self.encoder.append(LayerNorm2d(mask_out_chans))
52
+ self.encoder.append(activation())
53
+ mask_in_chans = mask_out_chans
54
+
55
+ self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1))
56
+
57
+ def forward(self, x):
58
+ return self.encoder(x)
59
+
60
+
61
+ # Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt)
62
+ class CXBlock(nn.Module):
63
+ r"""ConvNeXt Block. There are two equivalent implementations:
64
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
65
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
66
+ We use (2) as we find it slightly faster in PyTorch
67
+
68
+ Args:
69
+ dim (int): Number of input channels.
70
+ drop_path (float): Stochastic depth rate. Default: 0.0
71
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ dim,
77
+ kernel_size=7,
78
+ padding=3,
79
+ drop_path=0.0,
80
+ layer_scale_init_value=1e-6,
81
+ use_dwconv=True,
82
+ ):
83
+ super().__init__()
84
+ self.dwconv = nn.Conv2d(
85
+ dim,
86
+ dim,
87
+ kernel_size=kernel_size,
88
+ padding=padding,
89
+ groups=dim if use_dwconv else 1,
90
+ ) # depthwise conv
91
+ self.norm = LayerNorm2d(dim, eps=1e-6)
92
+ self.pwconv1 = nn.Linear(
93
+ dim, 4 * dim
94
+ ) # pointwise/1x1 convs, implemented with linear layers
95
+ self.act = nn.GELU()
96
+ self.pwconv2 = nn.Linear(4 * dim, dim)
97
+ self.gamma = (
98
+ nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
99
+ if layer_scale_init_value > 0
100
+ else None
101
+ )
102
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
103
+
104
+ def forward(self, x):
105
+ input = x
106
+ x = self.dwconv(x)
107
+ x = self.norm(x)
108
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
109
+ x = self.pwconv1(x)
110
+ x = self.act(x)
111
+ x = self.pwconv2(x)
112
+ if self.gamma is not None:
113
+ x = self.gamma * x
114
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
115
+
116
+ x = input + self.drop_path(x)
117
+ return x
118
+
119
+
120
+ class Fuser(nn.Module):
121
+ def __init__(self, layer, num_layers, dim=None, input_projection=False):
122
+ super().__init__()
123
+ self.proj = nn.Identity()
124
+ self.layers = get_clones(layer, num_layers)
125
+
126
+ if input_projection:
127
+ assert dim is not None
128
+ self.proj = nn.Conv2d(dim, dim, kernel_size=1)
129
+
130
+ def forward(self, x):
131
+ # normally x: (N, C, H, W)
132
+ x = self.proj(x)
133
+ for layer in self.layers:
134
+ x = layer(x)
135
+ return x
136
+
137
+
138
+ class MemoryEncoder(nn.Module):
139
+ def __init__(
140
+ self,
141
+ out_dim,
142
+ mask_downsampler,
143
+ fuser,
144
+ position_encoding,
145
+ in_dim=256, # in_dim of pix_feats
146
+ ):
147
+ super().__init__()
148
+
149
+ self.mask_downsampler = mask_downsampler
150
+
151
+ self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
152
+ self.fuser = fuser
153
+ self.position_encoding = position_encoding
154
+ self.out_proj = nn.Identity()
155
+ if out_dim != in_dim:
156
+ self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)
157
+
158
+ def forward(
159
+ self,
160
+ pix_feat: torch.Tensor,
161
+ masks: torch.Tensor,
162
+ skip_mask_sigmoid: bool = False,
163
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
164
+ ## Process masks
165
+ # sigmoid, so that less domain shift from gt masks which are bool
166
+ if not skip_mask_sigmoid:
167
+ masks = F.sigmoid(masks)
168
+ masks = self.mask_downsampler(masks)
169
+
170
+ ## Fuse pix_feats and downsampled masks
171
+ # in case the visual features are on CPU, cast them to CUDA
172
+ pix_feat = pix_feat.to(masks.device)
173
+
174
+ x = self.pix_feat_proj(pix_feat)
175
+ x = x + masks
176
+ x = self.fuser(x)
177
+ x = self.out_proj(x)
178
+
179
+ pos = self.position_encoding(x).to(x.dtype)
180
+
181
+ return {"vision_features": x, "vision_pos_enc": [pos]}
sam2/modeling/position_encoding.py ADDED
@@ -0,0 +1,221 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import math
+ from typing import Any, Optional, Tuple
+
+ import numpy as np
+
+ import torch
+ from torch import nn
+
+
+ class PositionEmbeddingSine(nn.Module):
+     """
+     This is a more standard version of the position embedding, very similar to the one
+     used by the Attention Is All You Need paper, generalized to work on images.
+     """
+
+     def __init__(
+         self,
+         num_pos_feats,
+         temperature: int = 10000,
+         normalize: bool = True,
+         scale: Optional[float] = None,
+     ):
+         super().__init__()
+         assert num_pos_feats % 2 == 0, "Expecting even model width"
+         self.num_pos_feats = num_pos_feats // 2
+         self.temperature = temperature
+         self.normalize = normalize
+         if scale is not None and normalize is False:
+             raise ValueError("normalize should be True if scale is passed")
+         if scale is None:
+             scale = 2 * math.pi
+         self.scale = scale
+
+         self.cache = {}
+
+     def _encode_xy(self, x, y):
+         # The positions are expected to be normalized
+         assert len(x) == len(y) and x.ndim == y.ndim == 1
+         x_embed = x * self.scale
+         y_embed = y * self.scale
+
+         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+         pos_x = x_embed[:, None] / dim_t
+         pos_y = y_embed[:, None] / dim_t
+         pos_x = torch.stack(
+             (pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2
+         ).flatten(1)
+         pos_y = torch.stack(
+             (pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2
+         ).flatten(1)
+         return pos_x, pos_y
+
+     @torch.no_grad()
+     def encode_boxes(self, x, y, w, h):
+         pos_x, pos_y = self._encode_xy(x, y)
+         pos = torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
+         return pos
+
+     encode = encode_boxes  # Backwards compatibility
+
+     @torch.no_grad()
+     def encode_points(self, x, y, labels):
+         (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
+         assert bx == by and nx == ny and bx == bl and nx == nl
+         pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten())
+         pos_x, pos_y = pos_x.reshape(bx, nx, -1), pos_y.reshape(by, ny, -1)
+         pos = torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)
+         return pos
+
+     @torch.no_grad()
+     def forward(self, x: torch.Tensor):
+         cache_key = (x.shape[-2], x.shape[-1])
+         if cache_key in self.cache:
+             return self.cache[cache_key][None].repeat(x.shape[0], 1, 1, 1)
+         y_embed = (
+             torch.arange(1, x.shape[-2] + 1, dtype=torch.float32, device=x.device)
+             .view(1, -1, 1)
+             .repeat(x.shape[0], 1, x.shape[-1])
+         )
+         x_embed = (
+             torch.arange(1, x.shape[-1] + 1, dtype=torch.float32, device=x.device)
+             .view(1, 1, -1)
+             .repeat(x.shape[0], x.shape[-2], 1)
+         )
+
+         if self.normalize:
+             eps = 1e-6
+             y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+             x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+         pos_x = x_embed[:, :, :, None] / dim_t
+         pos_y = y_embed[:, :, :, None] / dim_t
+         pos_x = torch.stack(
+             (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
+         ).flatten(3)
+         pos_y = torch.stack(
+             (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
+         ).flatten(3)
+         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+         self.cache[cache_key] = pos[0]
+         return pos
+
+
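
As a quick orientation for the class above: `PositionEmbeddingSine.forward` only reads the input's shape and device, returns a `(B, num_pos_feats, H, W)` sine/cosine encoding, and caches one copy per spatial size. A hypothetical usage sketch (not code from this commit):

```python
import torch
from sam2.modeling.position_encoding import PositionEmbeddingSine

pe = PositionEmbeddingSine(num_pos_feats=256)  # must be even
feat = torch.zeros(2, 256, 32, 32)             # only the shape and device matter here
pos = pe(feat)
print(pos.shape)                               # torch.Size([2, 256, 32, 32])
_ = pe(feat)                                   # same (H, W): served from self.cache
```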
+ class PositionEmbeddingRandom(nn.Module):
+     """
+     Positional encoding using random spatial frequencies.
+     """
+
+     def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+         super().__init__()
+         if scale is None or scale <= 0.0:
+             scale = 1.0
+         self.register_buffer(
+             "positional_encoding_gaussian_matrix",
+             scale * torch.randn((2, num_pos_feats)),
+         )
+
+     def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+         """Positionally encode points that are normalized to [0,1]."""
+         # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+         coords = 2 * coords - 1
+         coords = coords @ self.positional_encoding_gaussian_matrix
+         coords = 2 * np.pi * coords
+         # outputs d_1 x ... x d_n x C shape
+         return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
+
+     def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+         """Generate positional encoding for a grid of the specified size."""
+         h, w = size
+         device: Any = self.positional_encoding_gaussian_matrix.device
+         grid = torch.ones((h, w), device=device, dtype=torch.float32)
+         y_embed = grid.cumsum(dim=0) - 0.5
+         x_embed = grid.cumsum(dim=1) - 0.5
+         y_embed = y_embed / h
+         x_embed = x_embed / w
+
+         pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
+         return pe.permute(2, 0, 1)  # C x H x W
+
+     def forward_with_coords(
+         self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+     ) -> torch.Tensor:
+         """Positionally encode points that are not normalized to [0,1]."""
+         coords = coords_input.clone()
+         coords[:, :, 0] = coords[:, :, 0] / image_size[1]
+         coords[:, :, 1] = coords[:, :, 1] / image_size[0]
+         return self._pe_encoding(coords.to(torch.float))  # B x N x C
+
+
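
For the random-frequency variant above, `forward` builds a dense grid encoding of shape `(2 * num_pos_feats, H, W)`, while `forward_with_coords` encodes unnormalized point coordinates given the image size. A hypothetical usage sketch:

```python
import torch
from sam2.modeling.position_encoding import PositionEmbeddingRandom

pe = PositionEmbeddingRandom(num_pos_feats=128)

dense = pe((64, 64))                                      # (256, 64, 64) grid encoding
points = torch.tensor([[[100.0, 200.0], [300.0, 50.0]]])  # B x N x 2 pixel coordinates
sparse = pe.forward_with_coords(points, image_size=(512, 512))
print(dense.shape, sparse.shape)                          # (256, 64, 64) and (1, 2, 256)
```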
+ # Rotary Positional Encoding, adapted from:
+ # 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
+ # 2. https://github.com/naver-ai/rope-vit
+ # 3. https://github.com/lucidrains/rotary-embedding-torch
+
+
+ def init_t_xy(end_x: int, end_y: int):
+     t = torch.arange(end_x * end_y, dtype=torch.float32)
+     t_x = (t % end_x).float()
+     t_y = torch.div(t, end_x, rounding_mode="floor").float()
+     return t_x, t_y
+
+
+ def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
+     freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+     freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+
+     t_x, t_y = init_t_xy(end_x, end_y)
+     freqs_x = torch.outer(t_x, freqs_x)
+     freqs_y = torch.outer(t_y, freqs_y)
+     freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
+     freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
+     return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
+
+
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+     ndim = x.ndim
+     assert 0 <= 1 < ndim
+     assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
+     shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
+     return freqs_cis.view(*shape)
+
+
+ def apply_rotary_enc(
+     xq: torch.Tensor,
+     xk: torch.Tensor,
+     freqs_cis: torch.Tensor,
+     repeat_freqs_k: bool = False,
+ ):
+     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+     xk_ = (
+         torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+         if xk.shape[-2] != 0
+         else None
+     )
+     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+     if xk_ is None:
+         # no keys to rotate, due to dropout
+         return xq_out.type_as(xq).to(xq.device), xk
+     # repeat freqs along seq_len dim to match k seq_len
+     if repeat_freqs_k:
+         r = xk_.shape[-2] // xq_.shape[-2]
+         if freqs_cis.is_cuda:
+             freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
+         else:
+             # torch.repeat on complex numbers may not be supported on non-CUDA devices
+             # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
+             freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
+     xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+     return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)
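
To make the axial RoPE helpers concrete: `compute_axial_cis` precomputes complex rotation factors for an `end_x` x `end_y` token grid, and `apply_rotary_enc` rotates query/key tensors whose last dimension is the per-head channel count. The shapes below are illustrative assumptions, not values taken from the model configs in this commit:

```python
import torch
from sam2.modeling.position_encoding import compute_axial_cis, apply_rotary_enc

head_dim, grid = 64, 8                         # per-head channels, 8x8 token grid
freqs_cis = compute_axial_cis(dim=head_dim, end_x=grid, end_y=grid)
print(freqs_cis.shape)                         # torch.Size([64, 32]), complex factors

xq = torch.randn(1, 2, grid * grid, head_dim)  # (batch, heads, tokens, head_dim)
xk = torch.randn(1, 2, grid * grid, head_dim)
xq_rot, xk_rot = apply_rotary_enc(xq, xk, freqs_cis=freqs_cis)
print(xq_rot.shape, xk_rot.shape)              # both unchanged from the inputs
```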
sam2/modeling/sam/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
sam2/modeling/sam/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (150 Bytes). View file
 
sam2/modeling/sam/__pycache__/mask_decoder.cpython-312.pyc ADDED
Binary file (12.6 kB). View file
 
sam2/modeling/sam/__pycache__/prompt_encoder.cpython-312.pyc ADDED
Binary file (9.44 kB). View file
 
sam2/modeling/sam/__pycache__/transformer.cpython-312.pyc ADDED
Binary file (15.3 kB). View file