bartduis committed on
Commit
70d1188
0 Parent(s):
.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ *.out
2
+ slurm/
3
+ *.pyc
4
+ *.png
5
+ !assets/*.png
6
+ *.mtl
7
+ *.obj
8
+ *.ply
9
+ *.pth
10
+ **/build/**
11
+ *.so
12
+ wandb**
13
+ logs
LICENSE ADDED
@@ -0,0 +1,109 @@
1
+ RaySt3R
2
+ SOFTWARE LICENSE AGREEMENT
3
+ ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH
4
+ USE ONLY
5
+ BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO
6
+ THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH
7
+ THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
8
+
9
+ This is a license agreement ("Agreement") between your academic institution or non-
10
+ profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie
11
+
12
+ Mellon University (called "Licensor" in this Agreement). All rights not specifically
13
+ granted to you in this Agreement are reserved for Licensor.
14
+ RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
15
+ Licensor retains exclusive ownership of any copy of the Software (as defined below)
16
+ licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
17
+ non-transferable license to use the Software for noncommercial research purposes,
18
+ without the right to sublicense, pursuant to the terms and conditions of this Agreement.
19
+ As used in this Agreement, the term "Software" means (i) the actual copy of all or any
20
+ portion of code for program routines made accessible to Licensee by Licensor pursuant to
21
+ this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder
22
+ or subsequently supplied by Licensor, including all or any file structures, programming
23
+ instructions, user interfaces and screen formats and sequences as well as any and all
24
+ documentation and instructions related to it, and (ii) all or any derivatives and/or
25
+ modifications created or made by You to any of the items specified in (i).
26
+ CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to
27
+ Licensor, and as such, Licensee agrees to receive all such materials in confidence and use
28
+ the Software only in accordance with the terms of this Agreement. Licensee agrees to
29
+ use reasonable effort to protect the Software from unauthorized use, reproduction,
30
+ distribution, or publication.
31
+ COPYRIGHT: The Software is owned by Licensor and is protected by United
32
+ States copyright laws and applicable international treaties and/or conventions.
33
+ PERMITTED USES: The Software may be used for your own noncommercial internal
34
+ research purposes. You understand and agree that Licensor is not obligated to implement
35
+ any suggestions and/or feedback you might provide regarding the Software, but to the
36
+ extent Licensor does so, you are not entitled to any compensation related thereto.
37
+ DERIVATIVES: You may create derivatives of or make modifications to the Software,
38
+ however, You agree that all and any such derivatives and modifications will be owned by
39
+ Licensor and become a part of the Software licensed to You under this Agreement. You
40
+ may only use such derivatives and modifications for your own noncommercial internal
41
+ research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement. You must provide to Licensor one copy
42
+ of all such derivatives and modifications in a recognized electronic format by way of
43
+ electronic mail sent to Bardienus Pieter Duisterhof at [email protected]
44
+ within thirty (30) days of the publication date of any publication that relates to any such
45
+ derivatives or modifications. You understand that Licensor is not obligated to distribute
46
+ or otherwise make available any derivatives or modifications provided by You.
47
+ BACKUPS: If Licensee is an organization, it may make that number of copies of the
48
+ Software necessary for internal noncommercial use at a single site within its organization
49
+ provided that all information appearing in or on the original labels, including the
50
+ copyright and trademark notices are copied onto the labels of the copies.
51
+ USES NOT PERMITTED: You may not distribute, copy or use the Software except as
52
+ explicitly permitted herein. Licensee has not been granted any trademark license as part
53
+ of this Agreement and may not use the name or mark "RaySt3R" "Carnegie Mellon" or any renditions thereof without the prior written
54
+ permission of Licensor.
55
+ You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part,
56
+ or provide third parties access to prior or present versions (or any parts thereof) of the
57
+ Software.
58
+ ASSIGNMENT: You may not assign this Agreement or your rights hereunder without
59
+ the prior written consent of Licensor. Any attempted assignment without such consent
60
+ shall be null and void.
61
+ TERM: The term of the license granted by this Agreement is from Licensee's acceptance
62
+ of this Agreement by clicking "I Agree" below or by using the Software until terminated
63
+ as provided below.
64
+ The Agreement automatically terminates without notice if you fail to comply with any
65
+ provision of this Agreement. Licensee may terminate this Agreement by ceasing using
66
+ the Software. Upon any termination of this Agreement, Licensee will delete any and all
67
+ copies of the Software. You agree that all provisions which operate to protect the
68
+ proprietary rights of Licensor shall remain in force should breach occur and that the
69
+ obligation of confidentiality described in this Agreement is binding in perpetuity and, as
70
+ such, survives the term of the Agreement.
71
+ FEE: Provided Licensee abides completely by the terms and conditions of this
72
+ Agreement, there is no fee due to Licensor for Licensee's use of the Software in
73
+ accordance with this Agreement.
74
+ DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS"
75
+ WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF
76
+ PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR
77
+ USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK
78
+
79
+ RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND
80
+ RELATED MATERIALS.
81
+ SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is
82
+ provided as part of this Agreement.
83
+ EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent
84
+ permitted under applicable law, Licensor shall not be liable for direct, indirect, special,
85
+ incidental, or consequential damages or lost profits related to Licensee's use of and/or
86
+ inability to use the Software, even if Licensor is advised of the possibility of such
87
+ damage.
88
+ EXPORT REGULATION: You agree to comply with any and all applicable U.S. export
89
+ control laws, regulations, and/or other laws related to the embargoes and sanction
90
+ programs administered by the U.S. Office of Foreign Assets Control. You may not export
91
+ or re-export the technology with individuals or companies on the U.S. Department of
92
+ Commerce, Department of State or Department of Treasury denied party lists
93
+ https://www.trade.gov/consolidated-screening-list . You represent and warrant that
94
+ Licensee is not an individual or company listed on such denied party lists.
95
+ SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid,
96
+ illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the
97
+ validity, legality and enforceability of the remaining provisions shall not in any way be
98
+ affected or impaired thereby.
99
+ NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or
100
+ remedy under this Agreement shall be construed as a waiver of any future or other
101
+ exercise of such right or remedy by Licensor.
102
+ GOVERNING LAW: This Agreement shall be construed and enforced in accordance
103
+ with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws
104
+ principles. You consent to the personal jurisdiction of the courts of this County and
105
+ waive their rights to venue outside of Allegheny County, Pennsylvania.
106
+ ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole
107
+ and entire agreement between Licensee and Licensor as to the matter set forth herein and
108
+ supersedes any previous agreements, understandings, and arrangements between the
109
+ parties relating hereto.
app.py ADDED
@@ -0,0 +1,199 @@
1
+ import numpy as np
2
+ import gradio as gr
3
+ import torch
4
+ import rembg
5
+ import trimesh
6
+ from moge.model.v1 import MoGeModel
7
+ from utils.geometry import compute_pointmap
8
+ import os, shutil
9
+ import cv2
10
+ from huggingface_hub import hf_hub_download
11
+ from PIL import Image
12
+ import matplotlib.pyplot as plt
13
+ from eval_wrapper.eval import EvalWrapper, eval_scene
14
+ from torchvision import transforms
15
+
16
+ outdir = "/tmp/rayst3r"
17
+
18
+ # loading all necessary models
19
+ print("Loading DINOv2 model")
20
+ dino_model = torch.hub.load('facebookresearch/dinov2', "dinov2_vitl14_reg")
21
+ dino_model.eval()
22
+ dino_model.to("cuda")
23
+
24
+ print("Loading MoGe model")
25
+ device = torch.device("cuda")
26
+ # Load the model from huggingface hub (or load from local).
27
+ moge_model = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(device)
28
+
29
+ print("Loading RaySt3R model")
30
+ rayst3r_checkpoint = hf_hub_download("bartduis/rayst3r", "rayst3r.pth")
31
+ rayst3r_model = EvalWrapper(rayst3r_checkpoint)
32
+
33
+ def depth2uint16(depth):
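+ # maps metric depth in [0, 10] m onto the full uint16 range; e.g. 1.0 m maps to 65535/10 = 6553.5 before the uint16 cast, 10 m saturates at 65535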
34
+ return depth * torch.iinfo(torch.uint16).max / 10.0 # threshold is in m, convert to uint16 value
35
+
36
+ def save_tensor_as_png(tensor: torch.Tensor, path: str, dtype: torch.dtype | None = None):
37
+ if dtype is None:
38
+ dtype = tensor.dtype
39
+ Image.fromarray(tensor.to(dtype).cpu().numpy()).save(path)
40
+
41
+ def colorize_points_with_turbo_all_dims(points, method='norm',cmap='turbo'):
42
+ """
43
+ Assigns colors to 3D points using the 'turbo' colormap based on a scalar computed from all 3 dimensions.
44
+
45
+ Args:
46
+ points (np.ndarray): (N, 3) array of 3D points.
47
+ method (str): Method for reducing 3D point to scalar. Options: 'norm', 'pca'.
48
+
49
+ Returns:
50
+ np.ndarray: (N, 3) RGB colors in [0, 1].
51
+ """
52
+ assert points.shape[1] == 3, "Input must be of shape (N, 3)"
53
+
54
+ if method == 'norm':
55
+ scalar = np.linalg.norm(points, axis=1)
56
+ elif method == 'pca':
57
+ # Project onto first principal component
58
+ mean = points.mean(axis=0)
59
+ centered = points - mean
60
+ u, s, vh = np.linalg.svd(centered, full_matrices=False)
61
+ scalar = centered @ vh[0] # Project onto first principal axis
62
+ else:
63
+ raise ValueError(f"Unknown method '{method}'")
64
+
65
+ # Normalize scalar to [0, 1]
66
+ scalar_min, scalar_max = scalar.min(), scalar.max()
67
+ normalized = (scalar - scalar_min) / (scalar_max - scalar_min + 1e-8)
68
+
69
+ # Apply turbo colormap
70
+ cmap = plt.colormaps.get_cmap(cmap)
71
+ colors = cmap(normalized)[:, :3] # Drop alpha
72
+
73
+ return colors
74
+
75
+ def prep_for_rayst3r(img,depth_dict,mask):
76
+ H, W = img.shape[:2]
77
+ intrinsics = depth_dict["intrinsics"].detach().cpu()
78
+ intrinsics[0] *= W
79
+ intrinsics[1] *= H
80
+
81
+ input_dir = os.path.join(outdir, "input")
82
+ if os.path.exists(input_dir):
83
+ shutil.rmtree(input_dir)
84
+ os.makedirs(input_dir, exist_ok=True)
85
+ # save intrinsics
86
+ torch.save(intrinsics, os.path.join(input_dir, "intrinsics.pt"))
87
+
88
+ # save depth
89
+ depth = depth_dict["depth"].cpu()
90
+ depth = depth2uint16(depth)
91
+ save_tensor_as_png(depth, os.path.join(input_dir, "depth.png"),dtype=torch.uint16)
92
+
93
+ # save mask as bool
94
+ save_tensor_as_png(torch.from_numpy(mask).bool(), os.path.join(input_dir, "mask.png"),dtype=torch.bool)
95
+ # save image
96
+ save_tensor_as_png(torch.from_numpy(img), os.path.join(input_dir, "rgb.png"))
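+ # the input folder now holds what eval_scene's loader expects: intrinsics.pt, depth.png (uint16), mask.png and rgb.png (cam2world.pt is optional and defaults to identity)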
97
+
98
+ def rayst3r_to_glb(img,depth_dict,mask,max_total_points=10e6,rotated=False):
99
+ prep_for_rayst3r(img,depth_dict,mask)
100
+ rayst3r_points = eval_scene(rayst3r_model,os.path.join(outdir, "input"),do_filter_all_masks=True,dino_model=dino_model).cpu()
101
+
102
+ # subsample points
103
+ n_points = int(min(max_total_points, rayst3r_points.shape[0])) # cast to int: max_total_points defaults to the float 10e6
104
+ rayst3r_points = rayst3r_points[torch.randperm(rayst3r_points.shape[0])[:n_points]].numpy()
105
+
106
+ rayst3r_points[:,1] = -rayst3r_points[:,1]
107
+ rayst3r_points[:,2] = -rayst3r_points[:,2]
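+ # flip Y and Z: the points come in an OpenCV-style camera frame (y down, z forward), while the glb viewer expects y up and z toward the viewer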
108
+
109
+ # color the points with the turbo colormap
110
+ colors = colorize_points_with_turbo_all_dims(rayst3r_points)
111
+
112
+ # load the input glb
113
+ scene = trimesh.Scene()
114
+ pct = trimesh.PointCloud(rayst3r_points, colors=colors, radius=0.01)
115
+ scene.add_geometry(pct)
116
+
117
+ outfile = os.path.join(outdir, "rayst3r.glb")
118
+ scene.export(outfile)
119
+ return outfile
120
+
121
+
122
+ def input_to_glb(outdir,img,depth_dict,mask,rotated=False):
123
+ H, W = img.shape[:2]
124
+ intrinsics = depth_dict["intrinsics"].cpu().numpy()
125
+ intrinsics[0] *= W
126
+ intrinsics[1] *= H
127
+
128
+ depth = depth_dict["depth"].cpu().numpy()
129
+ cam2world = np.eye(4)
130
+ points_world = compute_pointmap(depth, cam2world, intrinsics)
131
+
132
+ scene = trimesh.Scene()
133
+ pts = np.concatenate([p[m] for p,m in zip(points_world,mask)])
134
+ col = np.concatenate([c[m] for c,m in zip(img,mask)])
135
+
136
+ pts = pts.reshape(-1,3)
137
+ pts[:,1] = -pts[:,1]
138
+ pts[:,2] = -pts[:,2]
139
+
140
+
141
+ pct = trimesh.PointCloud(pts, colors=col.reshape(-1,3))
142
+ scene.add_geometry(pct)
143
+
144
+ outfile = os.path.join(outdir, "input.glb")
145
+ scene.export(outfile)
146
+ return outfile
147
+
148
+ def depth_moge(input_img):
149
+ input_img_torch = torch.tensor(input_img / 255, dtype=torch.float32, device=device).permute(2, 0, 1)
150
+ output = moge_model.infer(input_img_torch)
151
+ return output
152
+
153
+ def mask_rembg(input_img):
154
+ #masked_img = rembg.remove(input_img,)
155
+ output_img = rembg.remove(input_img, alpha_matting=False, post_process_mask=True)
156
+
157
+ # Convert to NumPy array
158
+ output_np = np.array(output_img)
159
+ alpha = output_np[..., 3]
160
+
161
+ # Step 2: Erode the alpha mask to shrink object slightly
162
+ kernel = np.ones((3, 3), np.uint8) # Adjust size for aggressiveness
163
+ eroded_alpha = cv2.erode(alpha, kernel, iterations=1)
164
+ # Step 3: Replace alpha channel
165
+ output_np[..., 3] = eroded_alpha
166
+
167
+ mask = output_np[:,:,-1] >= 128
168
+ rgb = output_np[:,:,:3]
169
+ return mask, rgb
170
+
171
+ def process_image(input_img):
172
+ # resize the input image
173
+ rotated = False
174
+ #if input_img.shape[0] > input_img.shape[1]:
175
+ #input_img = cv2.rotate(input_img, cv2.ROTATE_90_COUNTERCLOCKWISE)
176
+ #rotated = True
177
+ input_img = cv2.resize(input_img, (640, 480))
178
+ mask, rgb = mask_rembg(input_img)
179
+ depth_dict = depth_moge(input_img)
180
+
181
+ if os.path.exists(outdir):
182
+ shutil.rmtree(outdir)
183
+ os.makedirs(outdir)
184
+
185
+ input_glb = input_to_glb(outdir,input_img,depth_dict,mask,rotated=rotated)
186
+
187
+ # visualize the input points in 3D in gradio
188
+ inference_glb = rayst3r_to_glb(input_img,depth_dict,mask,rotated=rotated)
189
+
190
+ return input_glb, inference_glb
191
+
192
+ demo = gr.Interface(
193
+ process_image,
194
+ gr.Image(),
195
+ [gr.Model3D(label="Input"), gr.Model3D(label="RaySt3R",)]
196
+ )
197
+
198
+ if __name__ == "__main__":
199
+ demo.launch()
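+ # run locally with e.g. `python app.py` and open the printed Gradio URL; a CUDA GPU is assumed since all models are loaded onto 'cuda'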
datasets/generic_loader.py ADDED
@@ -0,0 +1,168 @@
1
+ bb = breakpoint
2
+ import torch
3
+ import trimesh
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import os
7
+ from pathlib import Path
8
+ import pickle
9
+ import tqdm
10
+ import json
11
+ from PIL import Image
12
+
13
+ class GenericLoader(torch.utils.data.Dataset):
14
+ def __init__(self,dir="octmae_data/tiny_train/train_processed",seed=747,size=10,datasets=["fp_objaverse"],split="train",dtype=torch.float32,mode="slow",
15
+ prefetch_dino=False,dino_features=[23],view_select_mode="new_zoom",noise_std=0.0,rendered_views_mode="None",**kwargs):
16
+ super().__init__(**kwargs)
17
+ self.dir = dir
18
+ self.rng = np.random.default_rng(seed)
19
+ self.size = size
20
+ self.datasets = datasets
21
+ self.split = split
22
+ self.dtype = dtype
23
+ self.mode = mode
24
+ self.prefetch_dino = prefetch_dino
25
+ self.view_select_mode = view_select_mode
26
+ self.noise_std = noise_std * torch.iinfo(torch.uint16).max / 10.0 # std of the depth noise in uint16 units (10 m spans the full uint16 range)
27
+ if self.mode == 'slow':
28
+ self.prefetch_dino = True
29
+ self.find_scenes()
30
+ self.dino_features = dino_features
31
+ self.rendered_views_mode = rendered_views_mode
32
+
33
+ def find_dataset_location_list(self,dataset):
34
+ data_dir = None
35
+ for d in self.dir:
36
+ datasets = os.listdir(d)
37
+ if dataset in datasets:
38
+ if data_dir is not None:
39
+ raise ValueError(f"Dataset {dataset} found in multiple locations: {self.dir}")
40
+ else:
41
+ data_dir = os.path.join(d,dataset)
42
+ if data_dir is None:
43
+ raise ValueError(f"Dataset {dataset} not found in {self.dir}")
44
+ return data_dir
45
+
46
+ def find_dataset_location(self,dataset):
47
+ if isinstance(self.dir,list):
48
+ data_dir = self.find_dataset_location_list(dataset)
49
+ else:
50
+ data_dir = os.path.join(self.dir,dataset)
51
+ if not os.path.exists(data_dir):
52
+ raise ValueError(f"Dataset {dataset} not found in {self.dir}")
53
+ return data_dir
54
+
55
+ def find_scenes(self):
56
+ all_scenes = {}
57
+ print("Loading scenes...")
58
+ for dataset in self.datasets:
59
+ dataset_dir = self.find_dataset_location(dataset)
60
+ scenes = json.load(open(os.path.join(dataset_dir, f"{self.split}_scenes.json")))
61
+ scene_ids = [dataset + "_" + f.split("/")[-2] + "_" + f.split("/")[-1] for f in scenes]
62
+ all_scenes.update(dict(zip(scene_ids, scenes)))
63
+ self.scenes = all_scenes
64
+ self.scene_ids = list(self.scenes.keys())
65
+ # shuffle the scene ids
66
+ self.rng.shuffle(self.scene_ids)
67
+ if self.size > 0:
68
+ self.scene_ids = self.scene_ids[:self.size]
69
+ self.size = len(self.scene_ids)
70
+ return scenes
71
+
72
+ def __len__(self):
73
+ return self.size
74
+
75
+ def decide_context_view(self,cam_dir):
76
+ # we pick the view furthest away from the origin as the view for conditioning
77
+ cam_dirs = [d for d in os.listdir(cam_dir) if os.path.isdir(os.path.join(cam_dir,d)) and not d.startswith("gen")] # input cam needs rgb
78
+
79
+ extrinsics = {c:torch.load(os.path.join(cam_dir,c,'cam2world.pt'),map_location='cpu',weights_only=True) for c in cam_dirs}
80
+ dist_origin = {c:torch.linalg.norm(extrinsics[c][:3,3]) for c in extrinsics}
81
+
82
+ if self.view_select_mode == 'new_zoom':
83
+ # find the view with the maximum distance to the origin
84
+ input_cam = max(dist_origin,key=dist_origin.get)
85
+ # pick another random view to predict, excluding the context view
86
+ elif self.view_select_mode == 'random':
87
+ # pick a random view
88
+ input_cam = str(self.rng.choice(list(dist_origin.keys())))
89
+ # pick another random view to predict, excluding the context view
90
+ else:
91
+ raise ValueError(f"Invalid mode: {self.view_select_mode}")
92
+
93
+ if self.rendered_views_mode == "None":
94
+ pass
95
+ elif self.rendered_views_mode == "random":
96
+ cam_dirs = [d for d in os.listdir(cam_dir) if os.path.isdir(os.path.join(cam_dir,d))]
97
+ elif self.rendered_views_mode == "always":
98
+ cam_dirs_gen = [d for d in os.listdir(cam_dir) if os.path.isdir(os.path.join(cam_dir,d)) and d.startswith("gen")]
99
+ if len(cam_dirs_gen) > 0:
100
+ cam_dirs = cam_dirs_gen
101
+ else:
102
+ raise ValueError(f"Invalid mode: {self.rendered_views_mode}")
103
+
104
+ possible_views = [v for v in cam_dirs if v != input_cam]
105
+ new_cam = str(self.rng.choice(possible_views))
106
+ return input_cam,new_cam
107
+
108
+ def transform_pointmap(self,pointmap_cam,c2w):
109
+ # pointmap: shape H x W x 3
110
+ # c2w: shape 4 x 4
111
+ # we want to transform the pointmap to the world frame
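+ # i.e. x_world = (c2w @ [x_cam, 1])[:3]; the per-pixel points are stored as row vectors, hence the right-multiplication by c2w.T below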
112
+ pointmap_cam_h = torch.cat([pointmap_cam,torch.ones(pointmap_cam.shape[:-1]+(1,)).to(pointmap_cam.device)],dim=-1)
113
+ pointmap_world_h = pointmap_cam_h @ c2w.T
114
+ pointmap_world = pointmap_world_h[...,:3]/pointmap_world_h[...,3:4]
115
+ return pointmap_world
116
+
117
+ def load_scene_slow(self,input_cam,new_cam,cam_dir):
118
+
119
+ data = dict(new_cams={},input_cams={})
120
+
121
+ data['new_cams']['c2ws'] = [torch.load(os.path.join(cam_dir,new_cam,'cam2world.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
122
+ data['new_cams']['depths'] = [torch.load(os.path.join(cam_dir,new_cam,'depth.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
123
+ data['new_cams']['pointmaps'] = [self.transform_pointmap(torch.load(os.path.join(cam_dir,new_cam,'pointmap.pt'),map_location='cpu',weights_only=True).to(self.dtype),data['new_cams']['c2ws'][0])]
124
+ data['new_cams']['Ks'] = [torch.load(os.path.join(cam_dir,new_cam,'intrinsics.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
125
+ data['new_cams']['valid_masks'] = [torch.load(os.path.join(cam_dir,new_cam,'mask.pt'),map_location='cpu',weights_only=True).to(torch.bool)]
126
+
127
+ # add the context views
128
+ data['input_cams']['c2ws'] = [torch.load(os.path.join(cam_dir,input_cam,'cam2world.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
129
+ data['input_cams']['depths'] = [torch.load(os.path.join(cam_dir,input_cam,'depth.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
130
+ data['input_cams']['pointmaps'] = [self.transform_pointmap(torch.load(os.path.join(cam_dir,input_cam,'pointmap.pt'),map_location='cpu',weights_only=True).to(self.dtype),data['input_cams']['c2ws'][0])]
131
+ data['input_cams']['Ks'] = [torch.load(os.path.join(cam_dir,input_cam,'intrinsics.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
132
+ data['input_cams']['valid_masks'] = [torch.load(os.path.join(cam_dir,input_cam,'mask.pt'),map_location='cpu',weights_only=True).to(torch.bool)]
133
+ data['input_cams']['imgs'] = [torch.load(os.path.join(cam_dir,input_cam,'rgb.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
134
+ data['input_cams']['dino_features'] = [torch.load(os.path.join(cam_dir,input_cam,f'dino_features_layer_{l}.pt'),map_location='cpu',weights_only=True).to(self.dtype) for l in self.dino_features]
135
+ return data
136
+
137
+ def depth_to_metric(self,depth):
138
+ # depth: shape H x W
139
+ # we want to convert the depth to a metric depth
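+ # inverse of the uint16 encoding used when the depth maps were saved: 65535 -> 10.0 m, 0 -> 0 m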
140
+ depth_max = 10.0
141
+ depth_scaled = depth_max * (depth / 65535.0)
142
+ return depth_scaled
143
+
144
+ def load_scene_fast(self,input_cam,new_cam,cam_dir):
145
+ data = dict(new_cams={},input_cams={})
146
+ data['new_cams']['c2ws'] = [torch.load(os.path.join(cam_dir,new_cam,'cam2world.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
147
+ data['new_cams']['Ks'] = [torch.load(os.path.join(cam_dir,new_cam,'intrinsics.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
148
+ data['new_cams']['depths'] = [torch.from_numpy(np.array(Image.open(os.path.join(cam_dir,new_cam,'depth.png'))).astype(np.float32))]
149
+ data['new_cams']['valid_masks'] = [torch.from_numpy(np.array(Image.open(os.path.join(cam_dir,new_cam,'mask.png'))))]
150
+
151
+ data['input_cams']['c2ws'] = [torch.load(os.path.join(cam_dir,input_cam,'cam2world.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
152
+ data['input_cams']['Ks'] = [torch.load(os.path.join(cam_dir,input_cam,'intrinsics.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
153
+ data['input_cams']['depths'] = [torch.from_numpy(np.array(Image.open(os.path.join(cam_dir,input_cam,'depth.png'))).astype(np.float32))]
154
+ data['input_cams']['valid_masks'] = [torch.from_numpy(np.array(Image.open(os.path.join(cam_dir,input_cam,'mask.png'))))]
155
+ data['input_cams']['imgs'] = [torch.from_numpy(np.array(Image.open(os.path.join(cam_dir,input_cam,'rgb.png'))))]
156
+ if self.prefetch_dino:
157
+ data['input_cams']['dino_features'] = [torch.cat([torch.load(os.path.join(cam_dir,input_cam,f'dino_features_layer_{l}.pt'),map_location='cpu',weights_only=True).to(self.dtype) for l in self.dino_features],dim=-1)]
158
+ return data
159
+
160
+ def __getitem__(self,idx):
161
+ cam_dir = os.path.join(self.scenes[self.scene_ids[idx]],'cameras')
162
+ #data['input_cams'] = {k:[v[0].unsqueeze(0)] for k,v in data['input_cams'].items()}
163
+ input_cam,new_cam = self.decide_context_view(cam_dir)
164
+ if self.mode == 'slow':
165
+ data = self.load_scene_slow(input_cam,new_cam,cam_dir)
166
+ else:
167
+ data = self.load_scene_fast(input_cam,new_cam,cam_dir)
168
+ return data
engine.py ADDED
@@ -0,0 +1,139 @@
1
+ bb=breakpoint
2
+ import torch
3
+ from utils.geometry import center_pointmaps, uncenter_pointmaps
4
+ from utils.utils import scenes_to_batch, batch_to_scenes
5
+ from utils.batch_prep import prepare_fast_batch, normalize_batch, denormalize_batch
6
+ from utils.viz import save_pointmaps
7
+ from tqdm import tqdm
8
+ import wandb
9
+ from utils import misc
10
+ from torch.amp import GradScaler
11
+ from utils.eval import eval_pred
12
+ from utils.geometry import depth2pts
13
+
14
+ def batch_to_device(batch,device='cuda'):
15
+ for key in batch:
16
+ if isinstance(batch[key],torch.Tensor):
17
+ batch[key] = batch[key].to(device)
18
+ elif isinstance(batch[key],dict):
19
+ batch[key] = batch_to_device(batch[key],device)
20
+ return batch
21
+
22
+ def eval_model(model,batch,mode='loss',device='cuda',dino_model=None,args=None,augmentor=None,return_scale=False):
23
+ batch = batch_to_device(batch,device)
24
+ # check if model is distributed
25
+ if isinstance(model,torch.nn.parallel.DistributedDataParallel):
26
+ dino_layers = model.module.dino_layers
27
+ else:
28
+ dino_layers = model.dino_layers
29
+ if 'pointmaps' not in list(batch['input_cams'].keys()):
30
+ batch = prepare_fast_batch(batch,dino_model,dino_layers)
31
+
32
+ normalize_mode = args.normalize_mode if args is not None else 'median'
33
+ batch, scale_factors = normalize_batch(batch,normalize_mode)
34
+ if augmentor is not None:
35
+ batch = augmentor(batch)
36
+
37
+ batch, n_cams = scenes_to_batch(batch)
38
+ batch = center_pointmaps(batch) # centering around first camera
39
+
40
+ device = args.device if args is not None else 'cuda'
41
+ with torch.amp.autocast(device_type=device, dtype=torch.bfloat16):
42
+ pred, gt, loss_dict = model(batch,mode='viz')
43
+
44
+ if 'pointmaps' not in list(pred.keys()):
45
+ pred['pointmaps'] = depth2pts(pred['depths'].squeeze(-1),batch['new_cams']['Ks'])
46
+ elif 'depths' not in list(pred.keys()):
47
+ pred['depths'] = pred['pointmaps'][...,-1]
48
+ loss_dict = {**loss_dict,**eval_pred(pred, gt)}
49
+ if mode == 'loss':
50
+ return loss_dict
51
+ elif mode == 'viz':
52
+ pred, gt, batch = uncenter_pointmaps(pred, gt, batch)
53
+ pred, gt, batch = batch_to_scenes(pred, gt,batch, n_cams)
54
+ if return_scale:
55
+ return pred, gt, loss_dict, scale_factors[0].item()
56
+ else:
57
+ return pred, gt, loss_dict
58
+ else:
59
+ raise ValueError(f"Invalid mode: {mode}")
60
+
61
+ def update_loss_dict(loss_dict,loss_dict_new,sample_count):
62
+ for key in loss_dict_new:
63
+ if key not in loss_dict:
64
+ loss_dict[key] = loss_dict_new[key]
65
+ else:
66
+ # previously stored value in loss_dict is average from sample_count samples
67
+ # so we need to update it to include the new sample
68
+ loss_dict[key] = (loss_dict[key] * sample_count + loss_dict_new[key]) / (sample_count + 1)
69
+ return loss_dict
70
+
71
+ def train_epoch(model, train_loader, optimizer, device='cuda', max_norm=1.0,log_wandb=False,epoch=0,batch_size=None,args=None,dino_model=None,augmentor=None):
72
+ model.train()
73
+ all_losses_dict = {}
74
+
75
+ sample_idx = epoch * batch_size * len(train_loader)
76
+ scaler = GradScaler()
77
+ for i, batch in tqdm(enumerate(train_loader),total=len(train_loader)):
78
+ optimizer.zero_grad()
79
+ new_loss_dict = eval_model(model, batch, mode='loss', device=device,dino_model=dino_model,args=args,augmentor=augmentor)
80
+ loss = new_loss_dict['loss']
81
+ if loss is None:
82
+ continue
83
+
84
+ scaler.scale(loss).backward()
85
+ # Unscales the gradients of optimizer's assigned params in-place
86
+ scaler.unscale_(optimizer)
87
+
88
+ grad_norm = torch.norm(torch.stack([torch.norm(p.grad) for p in model.parameters() if p.grad is not None]))
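+ # global gradient norm over all parameters; used for logging and to catch NaNs (debugger hook below)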
89
+ if grad_norm.isnan():
90
+ breakpoint()
91
+
92
+ ## Since the gradients of optimizer's assigned params are unscaled, clips as usual:
93
+ if max_norm > 0:
94
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
95
+
96
+ # optimizer's gradients are already unscaled, so scaler.step does not unscale them,
97
+ # although it still skips optimizer.step() if the gradients contain infs or NaNs.
98
+ scaler.step(optimizer)
99
+
100
+ # Updates the scale for next iteration.
101
+ scaler.update()
102
+
103
+ new_loss_dict['grad_norm'] = grad_norm.detach().cpu().item()
104
+
105
+ misc.adjust_learning_rate(optimizer, epoch + i/len(train_loader), args)
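+ # the fractional epoch (epoch + i/len(train_loader)) gives a per-iteration learning-rate schedule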
106
+ optimizer.step()
107
+
108
+ new_loss_dict = {k: (v.detach().cpu().item() if isinstance(v, torch.Tensor) else v) for k, v in new_loss_dict.items()}
109
+ if log_wandb:
110
+ wandb_dict = {f"train_{k}":v for k,v in new_loss_dict.items()}
111
+ wandb.log(wandb_dict, step=sample_idx + (i+1)*batch_size)
112
+ # log learning rate
113
+ wandb.log({"train_lr": optimizer.param_groups[0]['lr']}, step=sample_idx + (i+1)*batch_size)
114
+
115
+ all_losses_dict = update_loss_dict(all_losses_dict, new_loss_dict,sample_count=i)
116
+ # Clear cache and delete variables to free memory
117
+ torch.cuda.empty_cache()
118
+ del loss
119
+ del new_loss_dict
120
+ del grad_norm
121
+ del batch
122
+
123
+ return all_losses_dict
124
+
125
+ def eval_epoch(model,test_loader,device='cuda',dino_model=None,args=None,augmentor=None):
126
+ model.eval()
127
+ all_losses_dict = {}
128
+ with torch.no_grad():
129
+ for i, batch in tqdm(enumerate(test_loader),total=len(test_loader)):
130
+ new_loss_dict = eval_model(model,batch,mode='loss',device=device,dino_model=dino_model,args=args,augmentor=augmentor)
131
+ if new_loss_dict is None:
132
+ continue
133
+ all_losses_dict = update_loss_dict(all_losses_dict,new_loss_dict,sample_count=i)
134
+
135
+ torch.cuda.empty_cache()
136
+ del new_loss_dict
137
+ del batch
138
+
139
+ return all_losses_dict
eval_wrapper/eval.py ADDED
@@ -0,0 +1,425 @@
1
+ from PIL import Image
2
+ import numpy as np
3
+ import torch
4
+ from torch.utils.data import DataLoader
5
+ from torchvision import transforms
6
+ import os
7
+ import sys
8
+ import open3d as o3d
9
+ current_dir = os.getcwd()
10
+ sys.path.append(current_dir)
11
+
12
+ from eval_wrapper.sample_poses import pointmap_to_poses
13
+ from utils.fusion import fuse_batch
14
+ from models.rayquery import *
15
+ from models.losses import *
16
+ import argparse
+ import copy # used below in eval_scene (copy.deepcopy)
17
+ from utils import misc
18
+ import torch.distributed as dist
19
+ from utils.collate import collate
20
+ from engine import eval_model
21
+ from utils.viz import just_load_viz
22
+ from utils.geometry import compute_pointmap_torch
23
+ from eval_wrapper.eval_utils import npy2ply, filter_all_masks
24
+ from huggingface_hub import hf_hub_download
25
+
26
+ class EvalWrapper(torch.nn.Module):
27
+ def __init__(self,checkpoint_path,distributed=False,device="cuda",dtype=torch.float32,**kwargs):
28
+ super().__init__()
29
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
30
+ model_string = checkpoint['args'].model
31
+
32
+ self.model = eval(model_string).to(device)
33
+ if distributed:
34
+ rank, world_size, local_rank = misc.setup_distributed()
35
+ self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[local_rank],find_unused_parameters=True)
36
+
37
+ self.dtype = dtype
38
+ self.model.load_state_dict(checkpoint['model'])
39
+ self.model.eval()
40
+
41
+ def forward(self,x,dino_model=None):
42
+ pred, gt, loss, scale = eval_model(self.model,x,mode='viz',dino_model=dino_model,return_scale=True)
43
+ return pred, gt, loss, scale
44
+
45
+ class PostProcessWrapper(torch.nn.Module):
46
+ def __init__(self,pred_mask_threshold = 0.5, mode='novel_views',
47
+ debug=False,conf_dist_mode='isotonic',set_conf=None,percentile=20,
48
+ no_input_mask=False,no_pred_mask=False):
49
+ super().__init__()
50
+ self.pred_mask_threshold = pred_mask_threshold
51
+ self.mode = mode
52
+ self.debug = debug
53
+ self.conf_dist_mode = conf_dist_mode
54
+ self.set_conf = set_conf
55
+ self.percentile = percentile
56
+ self.no_input_mask = no_input_mask
57
+ self.no_pred_mask = no_pred_mask
58
+
59
+ def transform_pointmap(self,pointmap_cam,c2w):
60
+ # pointmap: shape H x W x 3
61
+ # c2w: shape 4 x 4
62
+ # we want to transform the pointmap to the world frame
63
+ pointmap_cam_h = torch.cat([pointmap_cam,torch.ones(pointmap_cam.shape[:-1]+(1,)).to(pointmap_cam.device)],dim=-1)
64
+ pointmap_world_h = pointmap_cam_h @ c2w.T
65
+ pointmap_world = pointmap_world_h[...,:3]/pointmap_world_h[...,3:4]
66
+ return pointmap_world
67
+
68
+ def reject_conf_points(self,conf_pts):
69
+ if self.set_conf is None:
70
+ raise ValueError("set_conf must be set")
71
+
72
+ conf_mask = conf_pts > self.set_conf
73
+ return conf_mask
74
+
75
+
76
+ def project_input_mask(self,pred_dict,batch):
77
+ input_mask = batch['input_cams']['original_valid_masks'][0][0] # shape H x W
78
+ input_c2w = batch['input_cams']['c2ws'][0][0]
79
+ input_w2c = torch.linalg.inv(input_c2w)
80
+ input_K = batch['input_cams']['Ks'][0][0]
81
+ H, W = input_mask.shape
82
+ pointmaps_input_cam = torch.stack([self.transform_pointmap(pmap,input_w2c@c2w) for pmap,c2w in zip(pred_dict['pointmaps'][0],batch['new_cams']['c2ws'][0])]) # bp: Assuming batch size is 1!!
83
+ img_coords = pointmaps_input_cam @ input_K.T
84
+ img_coords = (img_coords[...,:2]/img_coords[...,2:3]).int()
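+ # standard pinhole projection: multiply the camera-frame points by K, then divide by depth to get integer pixel coordinates (u, v)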
85
+
86
+ n_views, H, W = img_coords.shape[:3]
87
+ device = input_mask.device
88
+ if self.no_input_mask:
89
+ combined_mask = torch.ones((n_views, H, W), device=device)
90
+ else:
91
+ combined_mask = torch.zeros((n_views, H, W), device=device)
92
+
93
+ # Flatten spatial dims
94
+ xs = img_coords[..., 0].view(n_views, -1) # [V, H*W]
95
+ ys = img_coords[..., 1].view(n_views, -1) # [V, H*W]
96
+
97
+ # Create base pixel coords (i, j)
98
+ i_coords = torch.arange(H, device=device).view(-1, 1).expand(H, W).reshape(-1) # [H*W]
99
+ j_coords = torch.arange(W, device=device).view(1, -1).expand(H, W).reshape(-1) # [H*W]
100
+ mask_coords = torch.stack((i_coords, j_coords), dim=-1) # [H*W, 2], shared across views
101
+
102
+ # Mask for valid projections
103
+ valid = (xs >= 0) & (xs < W) & (ys >= 0) & (ys < H) # [V, H*W]
104
+
105
+ # Clip out-of-bounds coords for indexing (only valid will be used anyway)
106
+ xs_clipped = torch.clamp(xs, 0, W-1)
107
+ ys_clipped = torch.clamp(ys, 0, H-1)
108
+
109
+ # input_mask lookup per view
110
+ flat_input_mask = input_mask[ys_clipped, xs_clipped] # [V, H*W]
111
+ input_mask_mask = flat_input_mask & valid # apply valid range mask
112
+
113
+ # Apply mask to coords and depths
114
+ depth_points = pointmaps_input_cam[..., -1].view(n_views, -1) # [V, H*W]
115
+ input_depths = batch['input_cams']['depths'][0][0][ys_clipped, xs_clipped] # [V, H*W]
116
+
117
+ depth_mask = (depth_points > input_depths) & input_mask_mask # final mask [V, H*W]
118
+ #depth_mask = input_mask_mask # final mask [V, H*W]
119
+
120
+ # Get final (i,j) coords to write
121
+ final_i = mask_coords[:, 0].unsqueeze(0).expand(n_views, -1)[depth_mask] # [N_mask]
122
+ final_j = mask_coords[:, 1].unsqueeze(0).expand(n_views, -1)[depth_mask] # [N_mask]
123
+ final_view_idx = torch.arange(n_views, device=device).view(-1, 1).expand(-1, H*W)[depth_mask] # [N_mask]
124
+
125
+ # Scatter final mask
126
+ combined_mask[final_view_idx, final_i, final_j] = 1
127
+ return combined_mask.unsqueeze(0).bool()
128
+
129
+ def forward(self,pred_dict,batch):
130
+ if self.mode == 'novel_views':
131
+ project_masks = self.project_input_mask(pred_dict,batch)
132
+ pred_mask_raw = torch.sigmoid(pred_dict['classifier'])
133
+ if self.no_pred_mask:
134
+ pred_masks = torch.ones_like(project_masks).bool()
135
+ else:
136
+ pred_masks = (pred_mask_raw > self.pred_mask_threshold).bool()
137
+
138
+ conf_masks = self.reject_conf_points(pred_dict['conf_pointmaps'])
139
+ combined_mask = project_masks & pred_masks & conf_masks
140
+ batch['new_cams']['valid_masks'] = combined_mask
141
+
142
+ elif self.mode == 'input_view':
143
+ conf_masks = self.reject_conf_points(pred_dict['conf_pointmaps'])
144
+ if self.no_pred_mask:
145
+ pred_masks = torch.ones_like(conf_masks).bool()
146
+ else:
147
+ pred_mask_raw = torch.sigmoid(pred_dict['classifier'])
148
+ pred_masks = (pred_mask_raw > self.pred_mask_threshold).bool()
149
+ combined_mask = conf_masks & batch['new_cams']['valid_masks'] & pred_masks
150
+ batch['new_cams']['valid_masks'] = combined_mask # this is for visualization
151
+
152
+ return pred_dict, batch
153
+
154
+ class GenericLoaderSmall(torch.utils.data.Dataset):
155
+ def __init__(self,data_dir,mode="single_scene",dtype=torch.float32,n_pred_views=3,pred_input_only=False,min_depth=0.1,
156
+ pointmap_for_bb=None,run_octmae=False,false_positive=None,false_negative=None):
157
+ self.data_dir = data_dir
158
+ self.mode = mode
159
+ self.dtype = dtype
160
+ self.rng = np.random.RandomState(seed=42)
161
+ self.n_pred_views = n_pred_views
162
+ self.min_depth = self.depth_metric_to_uint16(min_depth)
163
+ if self.mode == "single_scene":
164
+ self.inputs = [data_dir]
165
+ self.pred_input_only = pred_input_only
166
+ if self.pred_input_only:
167
+ self.n_pred_views = 1
168
+ self.desired_resolution = (480,640)
169
+ self.resize_transform_rgb = transforms.Resize(self.desired_resolution)
170
+ self.resize_transform_depth = transforms.Resize(self.desired_resolution,interpolation=transforms.InterpolationMode.NEAREST)
171
+ self.pointmap_for_bb = pointmap_for_bb
172
+ self.run_octmae = run_octmae
173
+ self.false_positive = false_positive
174
+ self.false_negative = false_negative
175
+
176
+ def transform_pointmap(self,pointmap_cam,c2w):
177
+ # pointmap: shape H x W x 3
178
+ # c2w: shape 4 x 4
179
+ # we want to transform the pointmap to the world frame
180
+ pointmap_cam_h = torch.cat([pointmap_cam,torch.ones(pointmap_cam.shape[:-1]+(1,)).to(pointmap_cam.device)],dim=-1)
181
+ pointmap_world_h = pointmap_cam_h @ c2w.T
182
+ pointmap_world = pointmap_world_h[...,:3]/pointmap_world_h[...,3:4]
183
+ return pointmap_world
184
+
185
+ def __len__(self):
186
+ return len(self.inputs)
187
+
188
+ def look_at(self,cam_pos, center=(0,0,0), up=(0,0,1)):
189
+ z = center - cam_pos
190
+ z /= np.linalg.norm(z, axis=-1, keepdims=True)
191
+ y = -np.float32(up)
192
+ y = y - np.sum(y * z, axis=-1, keepdims=True) * z
193
+ y /= np.linalg.norm(y, axis=-1, keepdims=True)
194
+ x = np.cross(y, z, axis=-1)
195
+
196
+ cam2w = np.r_[np.c_[x,y,z,cam_pos],[[0,0,0,1]]]
197
+ return cam2w.astype(np.float32)
198
+
199
+ def find_new_views(self,n_views,geometric_median = (0,0,0),r_min=0.4,r_max=0.9):
200
+ rad = self.rng.uniform(r_min,r_max, size=n_views)
201
+ azi = self.rng.uniform(0, 2*np.pi, size=n_views)
202
+ ele = self.rng.uniform(-np.pi, np.pi, size=n_views)
203
+ cam_centers = np.c_[np.cos(azi), np.sin(azi)]
204
+ cam_centers = rad[:,None] * np.c_[np.cos(ele)[:,None]*cam_centers, np.sin(ele)] + geometric_median
205
+
206
+ c2ws = [self.look_at(cam_pos=cam_center,center=geometric_median) for cam_center in cam_centers]
207
+ return c2ws
208
+
209
+ def depth_uint16_to_metric(self,depth):
210
+ return depth / torch.iinfo(torch.uint16).max * 10.0 # threshold is in m, convert to uint16 value
211
+
212
+ def depth_metric_to_uint16(self,depth):
213
+ return depth * torch.iinfo(torch.uint16).max / 10.0 # threshold is in m, convert to uint16 value
214
+
215
+ def resize(self,depth,img,mask,K):
216
+ s_x = self.desired_resolution[1] / img.shape[1]
217
+ s_y = self.desired_resolution[0] / img.shape[0]
218
+ depth = self.resize_transform_depth(depth.unsqueeze(0)).squeeze(0)
219
+ img = self.resize_transform_rgb(img.permute(-1,0,1)).permute(1,2,0)
220
+ mask = self.resize_transform_depth(mask.unsqueeze(0)).squeeze(0)
221
+ K[0] *= s_x
222
+ K[1] *= s_y
223
+ return depth, img, mask, K
224
+
225
+ def add_false_positives_and_negatives(self,valid_mask,false_positive,false_negative):
226
+ # add false positives to the valid mask
227
+ # add false negatives to the valid mask
228
+ # return the new valid mask
229
+ n_total_pixels = valid_mask.sum()
230
+ n_pixels_left = n_total_pixels * (1-false_positive)
231
+
232
+ mask_pixels_coords = torch.where(valid_mask)
233
+ left_pixels_coords = torch.where(~valid_mask)
234
+
235
+ # false positives
236
+ n_false_positives = min(int(n_pixels_left * false_positive),n_pixels_left)
237
+ # randomly sample n_false_positives from mask_pixels_coords
238
+ false_positives = torch.randperm(len(left_pixels_coords[0]))[:n_false_positives]
239
+ valid_mask[left_pixels_coords[0][false_positives],left_pixels_coords[1][false_positives]] = 1
240
+
241
+ # false negatives
242
+ n_false_negatives = min(int(n_total_pixels * false_negative),n_total_pixels)
243
+ # randomly sample n_false_negatives from left_pixels_coords
244
+ false_negatives = torch.randperm(len(mask_pixels_coords[0]))[:n_false_negatives]
245
+ valid_mask[mask_pixels_coords[0][false_negatives],mask_pixels_coords[1][false_negatives]] = 0
246
+
247
+ return valid_mask
248
+
249
+ def __getitem__(self,idx):
250
+ scene_dir = self.inputs[idx]
251
+
252
+ data = dict(new_cams={},input_cams={})
253
+
254
+ c2w_path = os.path.join(scene_dir,'cam2world.pt')
255
+ if os.path.exists(c2w_path):
256
+ data['input_cams']['c2ws_original'] = [torch.load(c2w_path,map_location='cpu',weights_only=True).to(self.dtype)]
257
+ else:
258
+ data['input_cams']['c2ws_original'] = [torch.eye(4).to(self.dtype)]
259
+
260
+ data['input_cams']['c2ws'] = [torch.eye(4).to(self.dtype)]
261
+ data['input_cams']['Ks'] = [torch.load(os.path.join(scene_dir,'intrinsics.pt'),map_location='cpu',weights_only=True).to(self.dtype)]
262
+ data['input_cams']['depths'] = [torch.from_numpy(np.array(Image.open(os.path.join(scene_dir,'depth.png'))).astype(np.float32))]
263
+ data['input_cams']['valid_masks'] = [torch.from_numpy(np.array(Image.open(os.path.join(scene_dir,'mask.png')))).bool()]
264
+ data['input_cams']['imgs'] = [torch.from_numpy(np.array(Image.open(os.path.join(scene_dir,'rgb.png'))))]
265
+
266
+ if self.false_positive is not None or self.false_negative is not None:
267
+ data['input_cams']['valid_masks'][0] = self.add_false_positives_and_negatives(data['input_cams']['valid_masks'][0],self.false_positive,self.false_negative)
268
+
269
+ if data['input_cams']['depths'][0].shape != self.desired_resolution:
270
+ data['input_cams']['depths'][0], data['input_cams']['imgs'][0], data['input_cams']['valid_masks'][0], data['input_cams']['Ks'][0] = \
271
+ self.resize(data['input_cams']['depths'][0], data['input_cams']['imgs'][0], data['input_cams']['valid_masks'][0], data['input_cams']['Ks'][0])
272
+
273
+ data['input_cams']['original_valid_masks'] = [data['input_cams']['valid_masks'][0].clone()]
274
+ data['input_cams']['valid_masks'][0] = data['input_cams']['valid_masks'][0] & \
275
+ (data['input_cams']['depths'][0] > self.min_depth)
276
+
277
+ if self.pred_input_only:
278
+ c2ws = [data['input_cams']['c2ws'][0].cpu().numpy()]
279
+ else:
280
+ input_mask = data['input_cams']['valid_masks'][0]
281
+ if self.pointmap_for_bb is not None:
282
+ pointmap_input = self.pointmap_for_bb
283
+ else:
284
+ pointmap_input = compute_pointmap_torch(self.depth_uint16_to_metric(data['input_cams']['depths'][0]),data['input_cams']['c2ws'][0],data['input_cams']['Ks'][0],device='cpu')[input_mask]
285
+ c2ws = pointmap_to_poses(pointmap_input, self.n_pred_views, inner_radius=1.1, outer_radius=2.5, device='cpu',run_octmae=self.run_octmae)
286
+ self.n_pred_views = len(c2ws)
287
+
288
+ data['new_cams'] = {}
289
+ data['new_cams']['c2ws'] = [torch.from_numpy(c2w).to(self.dtype) for c2w in c2ws]
290
+ data['new_cams']['depths'] = [torch.zeros_like(data['input_cams']['depths'][0]) for _ in range(self.n_pred_views)]
291
+ data['new_cams']['Ks'] = [data['input_cams']['Ks'][0] for _ in range(self.n_pred_views)]
292
+ if self.pred_input_only:
293
+ data['new_cams']['valid_masks'] = data['input_cams']['original_valid_masks']
294
+ else:
295
+ data['new_cams']['valid_masks'] = [torch.ones_like(data['input_cams']['valid_masks'][0]) for _ in range(self.n_pred_views)]
296
+
297
+ return data
298
+
299
+ def dict_to_float(d):
300
+ return {k: v.float() for k, v in d.items()}
301
+
302
+ def merge_dicts(d1,d2):
303
+ # stack the tensors along dimension 1
304
+ for k,v in d1.items():
305
+ d1[k] = torch.cat([d1[k],d2[k]],dim=1)
306
+ return d1
307
+
308
+ def compute_all_points(pred_dict,batch):
309
+ n_views = pred_dict['depths'].shape[1]
310
+ all_points = None
311
+ for i in range(n_views):
312
+ mask = batch['new_cams']['valid_masks'][0,i]
313
+ pointmap = compute_pointmap_torch(pred_dict['depths'][0,i],batch['new_cams']['c2ws'][0,i],batch['new_cams']['Ks'][0,i])
314
+ masked_points = pointmap[mask]
315
+ if all_points is None:
316
+ all_points = masked_points
317
+ else:
318
+ all_points = torch.cat([all_points,masked_points],dim=0)
319
+ return all_points
320
+
321
+ def eval_scene(model, data_dir,visualize=False,rr_addr=None,run_octmae=False,set_conf=5,
322
+ no_input_mask=False,no_pred_mask=False,no_filter_input_view=False,false_positive=None,false_negative=None,n_pred_views=5,
323
+ do_filter_all_masks=False, dino_model=None,tsdf=False):
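+ # rough flow: (1) predict and filter the input view, (2) sample novel viewpoints around the filtered input points,
+ # (3) predict depth/masks for those views, then merge everything and return one fused point cloud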
324
+
325
+ if dino_model is None:
326
+ # Loading DINOv2 model
327
+ dino_model = torch.hub.load('facebookresearch/dinov2', "dinov2_vitl14_reg")
328
+ dino_model.eval()
329
+ dino_model.to("cuda")
330
+
331
+ dataloader_input_view = GenericLoaderSmall(data_dir,n_pred_views=1,pred_input_only=True,false_positive=false_positive,false_negative=false_negative)
332
+ input_view_loader = DataLoader(dataloader_input_view, batch_size=1, shuffle=True, collate_fn=collate)
333
+ input_view_batch = next(iter(input_view_loader))
334
+
335
+ postprocessor_input_view = PostProcessWrapper(mode='input_view',set_conf=set_conf,
336
+ no_input_mask=no_input_mask,no_pred_mask=no_pred_mask)
337
+ postprocessor_pred_views = PostProcessWrapper(mode='novel_views',debug=False,set_conf=set_conf,
338
+ no_input_mask=no_input_mask,no_pred_mask=no_pred_mask)
339
+ fused_meshes = None
340
+ with torch.no_grad():
341
+ pred_input_view, gt_input_view, _, scale_factor = model(input_view_batch,dino_model)
342
+ if no_filter_input_view:
343
+ pred_input_view['pointmaps'] = input_view_batch['input_cams']['pointmaps']
344
+ pred_input_view['depths'] = input_view_batch['input_cams']['depths']
345
+ else:
346
+ pred_input_view, input_view_batch = postprocessor_input_view(pred_input_view,input_view_batch)
347
+
348
+ input_points = pred_input_view['pointmaps'][0][0][input_view_batch['new_cams']['valid_masks'][0][0]] * (1.0/scale_factor)
349
+ if input_points.shape[0] == 0:
350
+ input_points = None
351
+
352
+ dataloader_pred_views = GenericLoaderSmall(data_dir,n_pred_views=n_pred_views,pred_input_only=False,
353
+ pointmap_for_bb=input_points,run_octmae=run_octmae)
354
+ pred_views_loader = DataLoader(dataloader_pred_views, batch_size=1, shuffle=True, collate_fn=collate)
355
+ pred_views_batch = next(iter(pred_views_loader))
356
+
357
+ # this is for the mask ablation
358
+ if (false_positive is not None or false_negative is not None) and input_points is not None:
359
+ pred_views_batch['input_cams']['valid_masks'] = input_view_batch['input_cams']['valid_masks']
360
+
361
+ pred_new_views, gt_new_views, _, scale_factor = model(pred_views_batch,dino_model)
362
+ pred_new_views, pred_views_batch = postprocessor_pred_views(pred_new_views,pred_views_batch)
363
+
364
+ pred = merge_dicts(dict_to_float(pred_input_view),dict_to_float(pred_new_views))
365
+ gt = merge_dicts(dict_to_float(gt_input_view),dict_to_float(gt_new_views))
366
+
367
+ batch = copy.deepcopy(input_view_batch)
368
+ batch['new_cams'] = merge_dicts(input_view_batch['new_cams'],pred_views_batch['new_cams'])
369
+ gt['pointmaps'] = None # make sure it's not used in viz
370
+
371
+ if do_filter_all_masks:
372
+ batch = filter_all_masks(pred,input_view_batch,max_outlier_views=1)
373
+
374
+ # scale factor is the scale we applied to the input view for inference
375
+ all_points = compute_all_points(pred,batch)
376
+ all_points = all_points*(1.0/scale_factor)
377
+
378
+ # transform all_points to the original coordinate system
379
+ all_points_h = torch.cat([all_points,torch.ones(all_points.shape[:-1]+(1,)).to(all_points.device)],dim=-1)
380
+ all_points_original = all_points_h @ batch['input_cams']['c2ws_original'][0][0].T
381
+ all_points = all_points_original[...,:3]
382
+
383
+ # optionally fuse the predictions into a simple TSDF mesh for visualization
384
+ if tsdf:
385
+ fused_meshes = fuse_batch(pred,gt,batch,voxel_size=0.002)
386
+ else:
387
+ fused_meshes = None
388
+
389
+ if visualize:
390
+ just_load_viz(pred, gt, batch, addr=rr_addr,fused_meshes=fused_meshes)
391
+ return all_points
392
+
393
+
394
+ def main():
395
+ parser = argparse.ArgumentParser()
396
+ parser.add_argument("data_dir", type=str)
397
+ parser.add_argument("--rr_addr", type=str, default="0.0.0.0:"+os.getenv("RERUN_RECORDING","9876"))
398
+ parser.add_argument("--visualize", action="store_true", default=False)
399
+ parser.add_argument("--run_octmae", action="store_true", default=False)
400
+ parser.add_argument("--set_conf", type=float, default=5)
401
+ parser.add_argument("--n_pred_views", type=int, default=5)
402
+ parser.add_argument("--filter_all_masks", action="store_true", default=False)
403
+ parser.add_argument("--tsdf", action="store_true", default=False)
404
+ # ablation settings
405
+ parser.add_argument("--no_input_mask", action="store_true", default=False)
406
+ parser.add_argument("--no_pred_mask", action="store_true", default=False)
407
+ parser.add_argument("--no_filter_input_view", action="store_true", default=False)
408
+ parser.add_argument("--false_positive", type=float, default=None)
409
+ parser.add_argument("--false_negative", type=float, default=None)
410
+ args = parser.parse_args()
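+ # example invocation (the path is a placeholder): python eval_wrapper/eval.py /path/to/scene --visualize --n_pred_views 5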
411
+
412
+ print("Loading checkpoint from Huggingface")
413
+ rayst3r_checkpoint = hf_hub_download("bartduis/rayst3r", "rayst3r.pth")
414
+
415
+ model = EvalWrapper(rayst3r_checkpoint,distributed=False)
416
+ all_points = eval_scene(model, args.data_dir,visualize=args.visualize,rr_addr=args.rr_addr,run_octmae=args.run_octmae,set_conf=args.set_conf,
417
+ no_input_mask=args.no_input_mask,no_pred_mask=args.no_pred_mask,no_filter_input_view=args.no_filter_input_view,false_positive=args.false_positive,
418
+ false_negative=args.false_negative,n_pred_views=args.n_pred_views,
419
+ do_filter_all_masks=args.filter_all_masks,tsdf=args.tsdf).cpu().numpy()
420
+ all_points_save = os.path.join(args.data_dir,"inference_points.ply")
421
+ o3d_pc = npy2ply(all_points,colors=None,normals=None)
422
+ o3d.io.write_point_cloud(all_points_save, o3d_pc)
423
+
424
+ if __name__ == "__main__":
425
+ main()
eval_wrapper/eval_utils.py ADDED
@@ -0,0 +1,125 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from scipy.stats import norm, lognorm
4
+ import torch
5
+ import open3d as o3d
6
+
7
+ def colorize_points_with_turbo_all_dims(points, method='norm',cmap='turbo'):
8
+ """
9
+ Assigns colors to 3D points using the 'turbo' colormap based on a scalar computed from all 3 dimensions.
10
+
11
+ Args:
12
+ points (np.ndarray): (N, 3) array of 3D points.
13
+ method (str): Method for reducing 3D point to scalar. Options: 'norm', 'pca'.
14
+
15
+ Returns:
16
+ np.ndarray: (N, 3) RGB colors in [0, 1].
17
+ """
18
+ assert points.shape[1] == 3, "Input must be of shape (N, 3)"
19
+
20
+ if method == 'norm':
21
+ scalar = np.linalg.norm(points, axis=1)
22
+ elif method == 'pca':
23
+ # Project onto first principal component
24
+ mean = points.mean(axis=0)
25
+ centered = points - mean
26
+ u, s, vh = np.linalg.svd(centered, full_matrices=False)
27
+ scalar = centered @ vh[0] # Project onto first principal axis
28
+ else:
29
+ raise ValueError(f"Unknown method '{method}'")
30
+
31
+ # Normalize scalar to [0, 1]
32
+ scalar_min, scalar_max = scalar.min(), scalar.max()
33
+ normalized = (scalar - scalar_min) / (scalar_max - scalar_min + 1e-8)
34
+
35
+ # Apply turbo colormap
36
+ cmap = plt.colormaps.get_cmap(cmap)
37
+ colors = cmap(normalized)[:, :3] # Drop alpha
38
+
39
+ return colors
40
+
41
+ def npy2ply(points,colors=None,normals=None):
42
+ cloud = o3d.geometry.PointCloud()
43
+ cloud.points = o3d.utility.Vector3dVector(points.astype(np.float64))
44
+
45
+ # compute the normals
46
+ if colors is not None:
47
+ if colors.max()>1:
48
+ colors = colors/255.0
49
+ cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float64))
50
+ else:
51
+ colors = colorize_points_with_turbo_all_dims(points)
52
+ cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float64))
53
+ if normals is not None:
54
+ cloud.normals = o3d.utility.Vector3dVector(normals.astype(np.float64))
55
+ return cloud
56
+
57
+ def transform_pointmap(pointmap_cam,c2w):
58
+ # pointmap: shape H x W x 3
59
+ # c2w: shape 4 x 4
60
+ # we want to transform the pointmap to the world frame
61
+ pointmap_cam_h = torch.cat([pointmap_cam,torch.ones(pointmap_cam.shape[:-1]+(1,)).to(pointmap_cam.device)],dim=-1)
62
+ pointmap_world_h = pointmap_cam_h @ c2w.T
63
+ pointmap_world = pointmap_world_h[...,:3]/pointmap_world_h[...,3:4]
64
+ return pointmap_world
65
+
66
+ def filter_all_masks(pred_dict, batch, max_outlier_views=1):
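+ # cross-view consistency filter: a pixel's 3D point is dropped if more than max_outlier_views other views
+ # project it, unoccluded, into their predicted background region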
67
+ pred_masks = (torch.sigmoid(pred_dict['classifier'][0]).float() < 0.5).bool() # [V, H, W]
68
+ n_views, H, W = pred_masks.shape
69
+ device = pred_masks.device
70
+
71
+ K = batch['input_cams']['Ks'][0][0] # [3, 3]
72
+ c2ws = batch['new_cams']['c2ws'][0] # [V, 4, 4]
73
+ w2cs = torch.linalg.inv(c2ws) # [V, 4, 4]
74
+
75
+ pointmaps = pred_dict['pointmaps'][0] # [V, H, W, 3]
76
+ pointmaps_h = torch.cat([pointmaps, torch.ones_like(pointmaps[..., :1])], dim=-1) # [V, H, W, 4]
77
+
78
+ visibility_count = torch.zeros((n_views, H, W), dtype=torch.int32, device=device)
79
+
80
+ for j in range(n_views):
81
+ # Project pointmap j to all other views i ≠ j
82
+ pmap_h = pointmaps_h[j] # [H, W, 4], world-space points from view j
83
+ pmap_h = pmap_h.view(1, H, W, 4).expand(n_views, -1, -1, -1) # [V, H, W, 4]
84
+
85
+ # Compute T_{i←j} = w2cs[i] @ c2ws[j]
86
+ T = w2cs @ c2ws[j] # [V, 4, 4]
87
+ T = T.view(n_views, 1, 1, 4, 4) # [V, 1, 1, 4, 4]
88
+
89
+ # Transform to i-th camera frame
90
+ pts_cam = torch.matmul(T, pmap_h.unsqueeze(-1)).squeeze(-1)[..., :3] # [V, H, W, 3]
91
+
92
+ # Project to image
93
+ img_coords = torch.matmul(pts_cam, K.T) # [V, H, W, 3]
94
+ img_coords = img_coords[..., :2] / img_coords[..., 2:3].clamp(min=1e-6)
95
+ img_coords = img_coords.round().long() # [V, H, W, 2]
96
+
97
+ x = img_coords[..., 0].clamp(0, W - 1)
98
+ y = img_coords[..., 1].clamp(0, H - 1)
99
+ valid = (img_coords[..., 0] >= 0) & (img_coords[..., 0] < W) & \
100
+ (img_coords[..., 1] >= 0) & (img_coords[..., 1] < H)
101
+
102
+ # Get depth of the reprojected point from j into i
103
+ reprojected_depth = pts_cam[..., 2] # [V, H, W]
104
+
105
+ # Get depth of each view's original pointmap
106
+ target_depth = pointmaps[:, :, :, 2] # [V, H, W]
107
+
108
+ # Lookup the depth value in view i at the projected location (x, y)
109
+ depth_at_pixel = target_depth[torch.arange(n_views).view(-1, 1, 1), y, x] # [V, H, W]
110
+
111
+ # Check that the point is in front (closest along ray)
112
+ is_closest = reprojected_depth < depth_at_pixel # [V, H, W]
113
+
114
+ # Lookup mask values at projected location
115
+ projected_mask = pred_masks[torch.arange(n_views).view(-1, 1, 1), y, x] & valid # [V, H, W]
116
+
117
+ # Only consider as visible if it’s within mask and closest point
118
+ visible = projected_mask & is_closest # [V, H, W]
119
+
120
+ # Count how many views see each pixel from j
121
+ visibility_count[j] = visible.sum(dim=0)
122
+
123
+ visibility_mask = (visibility_count <= max_outlier_views).bool()
124
+ batch['new_cams']['valid_masks'] = visibility_mask & batch['new_cams']['valid_masks']
125
+ return batch
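
A minimal usage sketch for the helpers above; the random pointmap and identity pose are placeholders:

import torch
import open3d as o3d
from eval_wrapper.eval_utils import npy2ply, transform_pointmap

# placeholder H x W x 3 pointmap in the camera frame and an identity camera-to-world pose
pointmap_cam = torch.rand(48, 64, 3)
c2w = torch.eye(4)

# move the points into the world frame, then export them as a colored point cloud
pointmap_world = transform_pointmap(pointmap_cam, c2w)
cloud = npy2ply(pointmap_world.reshape(-1, 3).numpy())  # colors fall back to the turbo colormap
o3d.io.write_point_cloud("inference_points_demo.ply", cloud)
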
eval_wrapper/sample_poses.py ADDED
@@ -0,0 +1,100 @@
1
+ import numpy as np
2
+ import torch
3
+ import open3d as o3d
4
+
5
+
6
+ def look_at(cam_pos, target=(0,0,0), up=(0,0,1)):
7
+ target = np.asarray(target, dtype=float)  # the default target is a plain tuple
+ # Forward vector
8
+ forward = target - cam_pos
9
+ forward /= np.linalg.norm(forward)
10
+
11
+ # Right vector (fall back to a different up vector if up is parallel to forward)
12
+ right = np.cross(up, forward)
13
+ if np.linalg.norm(right) < 1e-6:
14
+ up = np.array([1, 0, 0])
15
+ right = np.cross(up, forward)
16
+
17
+ right /= np.linalg.norm(right)
18
+ up = np.cross(forward, right)
19
+
20
+ # Build rotation and translation matrices
21
+ rotation = np.eye(4)
22
+ rotation[:3, :3] = np.vstack([right, up, -forward]).T
23
+
24
+
25
+ translation = np.eye(4)
26
+ translation[:3, 3] = cam_pos
27
+
28
+ cam_to_world = translation @ rotation
29
+ cam_to_world[:3,2] = -cam_to_world[:3,2]
30
+ cam_to_world[:3,1] = -cam_to_world[:3,1]
31
+ # the sign flips above convert the y/z columns to an OpenCV-style camera frame (x right, y down, z forward)
32
+ return cam_to_world
33
+
34
+
35
+ def sample_camera_poses(target: np.ndarray, inner_radius: float, outer_radius: float, n: int,seed: int = 42,mode: str = 'grid') -> np.ndarray:
36
+ """
37
+ Samples `n` camera poses uniformly on a sphere of given `radius` around `target`.
38
+ The cameras are positioned randomly and oriented to look at `target`.
39
+
40
+ Args:
41
+ target (np.ndarray): 3D point (x, y, z) that cameras should look at.
42
+ inner_radius (float): Radius of the sphere.
43
+ outer_radius (float): Radius of the sphere.
44
+ n (int): Number of camera poses to sample.
45
+
46
+ Returns:
47
+ torch.Tensor: (n, 4, 4) array of transformation matrices (camera-to-world).
48
+ """
49
+ cameras = []
50
+ np.random.seed(seed)
51
+
52
+ u_1 = np.linspace(0,1,n,endpoint=False)
53
+ u_2 = np.linspace(0,0.7,n)
54
+ u_1, u_2 = np.meshgrid(u_1, u_2)
55
+ u_1 = u_1.flatten()
56
+ u_2 = u_2.flatten()
57
+ theta = np.arccos(1-2*u_2)
58
+ phi = 2*np.pi*u_1
59
+ n_poses = len(phi)
60
+
61
+ radii = np.random.uniform(inner_radius, outer_radius, n_poses)
62
+ cameras = []
63
+
64
+ r_z = np.array([[0,-1,0],[1,0,0],[0,0,1]])
65
+
66
+ for i in range(n_poses):
67
+ # Camera position on the sphere
68
+ x = target[0] + radii[i] * np.sin(theta[i]) * np.cos(phi[i])
69
+ y = target[1] + radii[i] * np.sin(theta[i]) * np.sin(phi[i])
70
+ z = target[2] + radii[i] * np.cos(theta[i])
71
+ cam_pos = np.array([x, y, z])
72
+ cam2world = look_at(cam_pos, target)
73
+ if theta[i] == 0:
74
+ cam2world[:3,:3] = cam2world[:3,:3] @ r_z # rotate 90 degrees around z axis for the camera opposite to the input
75
+ cameras.append(cam2world)
76
+ cameras = np.unique(cameras, axis=0)
77
+ return np.stack(cameras)
78
+
79
+
80
+ def pointmap_to_poses(pointmaps: torch.Tensor, n_poses: int, inner_radius: float = 1.1, outer_radius: float = 2.5, device: str = 'cuda',
81
+ bb_mode: str='bb',run_octmae: bool = False) -> np.ndarray:
82
+ """
83
+ Samples camera poses on a sphere around the center of the pointmap's bounding box.
84
+ The sphere radius is derived from the camera-to-center distance and the bounding-box size, and every camera looks at the center.
85
+ """
86
+
87
+ bb_min_corner = pointmaps.min(dim=0)[0].cpu().numpy()
88
+ bb_max_corner = pointmaps.max(dim=0)[0].cpu().numpy()
89
+ center = (bb_min_corner + bb_max_corner) / 2  # center of the bounding box
+ # (alternative: inner_radius = inner_radius * np.linalg.norm(bb_max_corner - bb_min_corner) / 2, i.e. a scalar multiple of the bounding-box radius)
90
+ bb_radius = np.linalg.norm(bb_max_corner - bb_min_corner) / 2
91
+ cam2center_dist = np.linalg.norm(center)
92
+
93
+ if run_octmae:
94
+ radius = max(1.2*cam2center_dist,2.5*bb_radius)
95
+ else:
96
+ radius = max(0.7*cam2center_dist,1.3*bb_radius)
97
+ inner_radius = radius
98
+ outer_radius = radius
99
+ camera_poses = sample_camera_poses(center, inner_radius, outer_radius, n_poses)
100
+ return camera_poses
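
A short sketch of how the samplers above are typically driven; the random point blob stands in for a real pointmap and all radii are illustrative:

import numpy as np
import torch
from eval_wrapper.sample_poses import pointmap_to_poses, sample_camera_poses

# (N, 3) scene points; a random blob in front of the camera as a placeholder
points = torch.rand(1000, 3) + torch.tensor([0.0, 0.0, 1.0])

# camera-to-world matrices looking at the center of the points' bounding box
c2ws = pointmap_to_poses(points, n_poses=4)
print(c2ws.shape)  # (K, 4, 4) with K <= 16, duplicates removed

# the lower-level sampler can also be called directly with an explicit target and radii
poses = sample_camera_poses(np.array([0.0, 0.0, 1.0]), inner_radius=1.0, outer_radius=1.5, n=4)
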
example_scene/cam2world.pt ADDED
Binary file (1.25 kB). View file
 
example_scene/intrinsics.pt ADDED
Binary file (1.2 kB). View file
 
extensions/curope/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+ from .curope2d import cuRoPE2D
extensions/curope/curope.cpp ADDED
@@ -0,0 +1,69 @@
1
+ /*
2
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
3
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ */
5
+
6
+ #include <torch/extension.h>
7
+
8
+ // forward declaration
9
+ void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd );
10
+
11
+ void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd )
12
+ {
13
+ const int B = tokens.size(0);
14
+ const int N = tokens.size(1);
15
+ const int H = tokens.size(2);
16
+ const int D = tokens.size(3) / 4;
17
+
18
+ auto tok = tokens.accessor<float, 4>();
19
+ auto pos = positions.accessor<int64_t, 3>();
20
+
21
+ for (int b = 0; b < B; b++) {
22
+ for (int x = 0; x < 2; x++) { // y and then x (2d)
23
+ for (int n = 0; n < N; n++) {
24
+
25
+ // grab the token position
26
+ const int p = pos[b][n][x];
27
+
28
+ for (int h = 0; h < H; h++) {
29
+ for (int d = 0; d < D; d++) {
30
+ // grab the two values
31
+ float u = tok[b][n][h][d+0+x*2*D];
32
+ float v = tok[b][n][h][d+D+x*2*D];
33
+
34
+ // grab the cos,sin
35
+ const float inv_freq = fwd * p / powf(base, d/float(D));
36
+ float c = cosf(inv_freq);
37
+ float s = sinf(inv_freq);
38
+
39
+ // write the result
40
+ tok[b][n][h][d+0+x*2*D] = u*c - v*s;
41
+ tok[b][n][h][d+D+x*2*D] = v*c + u*s;
42
+ }
43
+ }
44
+ }
45
+ }
46
+ }
47
+ }
48
+
49
+ void rope_2d( torch::Tensor tokens, // B,N,H,D
50
+ const torch::Tensor positions, // B,N,2
51
+ const float base,
52
+ const float fwd )
53
+ {
54
+ TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions");
55
+ TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions");
56
+ TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions");
57
+ TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions");
58
+ TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2");
59
+ TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" );
60
+
61
+ if (tokens.is_cuda())
62
+ rope_2d_cuda( tokens, positions, base, fwd );
63
+ else
64
+ rope_2d_cpu( tokens, positions, base, fwd );
65
+ }
66
+
67
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
68
+ m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward");
69
+ }
extensions/curope/curope.egg-info/PKG-INFO ADDED
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.1
2
+ Name: curope
3
+ Version: 0.0.0
4
+ Summary: UNKNOWN
5
+ Home-page: UNKNOWN
6
+ License: UNKNOWN
7
+ Platform: UNKNOWN
8
+
9
+ UNKNOWN
10
+
extensions/curope/curope.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,9 @@
1
+ __init__.py
2
+ curope.cpp
3
+ curope2d.py
4
+ kernels.cu
5
+ setup.py
6
+ curope.egg-info/PKG-INFO
7
+ curope.egg-info/SOURCES.txt
8
+ curope.egg-info/dependency_links.txt
9
+ curope.egg-info/top_level.txt
extensions/curope/curope.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
extensions/curope/curope.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ curope
extensions/curope/curope2d.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+ import torch
5
+
6
+ try:
7
+ import curope as _kernels # run `python setup.py install`
8
+ except ModuleNotFoundError:
9
+ from . import curope as _kernels # run `python setup.py build_ext --inplace`
10
+
11
+ from torch.amp import custom_fwd, custom_bwd
12
+
13
+ class cuRoPE2D_func (torch.autograd.Function):
14
+
15
+ @staticmethod
16
+ @custom_fwd(device_type='cuda', cast_inputs=torch.float32)
17
+ def forward(ctx, tokens, positions, base, F0=1):
18
+ ctx.save_for_backward(positions)
19
+ ctx.saved_base = base
20
+ ctx.saved_F0 = F0
21
+ # tokens = tokens.clone() # uncomment this if inplace doesn't work
22
+ _kernels.rope_2d( tokens, positions, base, F0 )
23
+ ctx.mark_dirty(tokens)
24
+ return tokens
25
+
26
+ @staticmethod
27
+ @custom_bwd(device_type='cuda')
28
+ def backward(ctx, grad_res):
29
+ positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0
30
+ _kernels.rope_2d( grad_res, positions, base, -F0 )
31
+ ctx.mark_dirty(grad_res)
32
+ return grad_res, None, None, None
33
+
34
+
35
+ class cuRoPE2D(torch.nn.Module):
36
+ def __init__(self, freq=100.0, F0=1.0):
37
+ super().__init__()
38
+ self.base = freq
39
+ self.F0 = F0
40
+
41
+ def forward(self, tokens, positions):
42
+ cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 )
43
+ return tokens
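
A hedged sketch of calling the module once the CUDA extension is built; it assumes a CUDA device, and note that the kernel expects the underlying memory layout (B, N, heads, head_dim), as produced at the call sites in models/blocks.py:

import torch
from extensions.curope import cuRoPE2D

rope = cuRoPE2D(freq=100.0)

B, heads, N, head_dim = 1, 8, 16, 64   # head_dim must be a multiple of 4
# allocate as (B, N, heads, head_dim) so the kernel's contiguity check passes, then view as (B, heads, N, head_dim)
q = torch.randn(B, N, heads, head_dim, device='cuda').transpose(1, 2)
# integer (y, x) patch positions for each of the N tokens on a 4 x 4 grid
pos = torch.cartesian_prod(torch.arange(4), torch.arange(4)).view(1, N, 2).cuda()

q = rope(q, pos)  # rotary embedding is applied in place on the CUDA tensor
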
extensions/curope/kernels.cu ADDED
@@ -0,0 +1,108 @@
1
+ /*
2
+ Copyright (C) 2022-present Naver Corporation. All rights reserved.
3
+ Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ */
5
+
6
+ #include <torch/extension.h>
7
+ #include <cuda.h>
8
+ #include <cuda_runtime.h>
9
+ #include <vector>
10
+
11
+ #define CHECK_CUDA(tensor) {\
12
+ TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \
13
+ TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); }
14
+ void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));}
15
+
16
+
17
+ template < typename scalar_t >
18
+ __global__ void rope_2d_cuda_kernel(
19
+ //scalar_t* __restrict__ tokens,
20
+ torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
21
+ const int64_t* __restrict__ pos,
22
+ const float base,
23
+ const float fwd )
24
+ // const int N, const int H, const int D )
25
+ {
26
+ // tokens shape = (B, N, H, D)
27
+ const int N = tokens.size(1);
28
+ const int H = tokens.size(2);
29
+ const int D = tokens.size(3);
30
+
31
+ // each block update a single token, for all heads
32
+ // each thread takes care of a single output
33
+ extern __shared__ float shared[];
34
+ float* shared_inv_freq = shared + D;
35
+
36
+ const int b = blockIdx.x / N;
37
+ const int n = blockIdx.x % N;
38
+
39
+ const int Q = D / 4;
40
+ // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
41
+ // u_Y v_Y u_X v_X
42
+
43
+ // shared memory: first, compute inv_freq
44
+ if (threadIdx.x < Q)
45
+ shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
46
+ __syncthreads();
47
+
48
+ // start of X or Y part
49
+ const int X = threadIdx.x < D/2 ? 0 : 1;
50
+ const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X
51
+
52
+ // grab the cos,sin appropriate for me
53
+ const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
54
+ const float cos = cosf(freq);
55
+ const float sin = sinf(freq);
56
+ /*
57
+ float* shared_cos_sin = shared + D + D/4;
58
+ if ((threadIdx.x % (D/2)) < Q)
59
+ shared_cos_sin[m+0] = cosf(freq);
60
+ else
61
+ shared_cos_sin[m+Q] = sinf(freq);
62
+ __syncthreads();
63
+ const float cos = shared_cos_sin[m+0];
64
+ const float sin = shared_cos_sin[m+Q];
65
+ */
66
+
67
+ for (int h = 0; h < H; h++)
68
+ {
69
+ // then, load all the token for this head in shared memory
70
+ shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
71
+ __syncthreads();
72
+
73
+ const float u = shared[m];
74
+ const float v = shared[m+Q];
75
+
76
+ // write output
77
+ if ((threadIdx.x % (D/2)) < Q)
78
+ tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
79
+ else
80
+ tokens[b][n][h][threadIdx.x] = v*cos + u*sin;
81
+ }
82
+ }
83
+
84
+ void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd )
85
+ {
86
+ const int B = tokens.size(0); // batch size
87
+ const int N = tokens.size(1); // sequence length
88
+ const int H = tokens.size(2); // number of heads
89
+ const int D = tokens.size(3); // dimension per head
90
+
91
+ TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous");
92
+ TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous");
93
+ TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape");
94
+ TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4");
95
+
96
+ // one block per (batch, token) pair, one thread per feature channel
97
+ const int THREADS_PER_BLOCK = D;
98
+ const int N_BLOCKS = B * N; // each block takes care of H*D values
99
+ const int SHARED_MEM = sizeof(float) * (D + D/4);
100
+
101
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(tokens.type(), "rope_2d_cuda", ([&] {
102
+ rope_2d_cuda_kernel<scalar_t> <<<N_BLOCKS, THREADS_PER_BLOCK, SHARED_MEM>>> (
103
+ //tokens.data_ptr<scalar_t>(),
104
+ tokens.packed_accessor32<scalar_t,4,torch::RestrictPtrTraits>(),
105
+ pos.data_ptr<int64_t>(),
106
+ base, fwd); //, N, H, D );
107
+ }));
108
+ }
extensions/curope/setup.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+ from setuptools import setup
5
+ from torch import cuda
6
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
7
+
8
+ # compile for all possible CUDA architectures
9
+ all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split()
10
+ # alternatively, you can list cuda archs that you want, eg:
11
+ # all_cuda_archs = [
12
+ # '-gencode', 'arch=compute_70,code=sm_70',
13
+ # '-gencode', 'arch=compute_75,code=sm_75',
14
+ # '-gencode', 'arch=compute_80,code=sm_80',
15
+ # '-gencode', 'arch=compute_86,code=sm_86'
16
+ # ]
17
+
18
+ setup(
19
+ name = 'curope',
20
+ ext_modules = [
21
+ CUDAExtension(
22
+ name='curope',
23
+ sources=[
24
+ "curope.cpp",
25
+ "kernels.cu",
26
+ ],
27
+ extra_compile_args = dict(
28
+ nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs,
29
+ cxx=['-O3'])
30
+ )
31
+ ],
32
+ cmdclass = {
33
+ 'build_ext': BuildExtension
34
+ })
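
As the comments in curope2d.py note, the extension can be installed system-wide or built in place; a quick sanity check after building (the build command assumes a CUDA toolchain matching the installed PyTorch):

# from inside extensions/curope/:
#   python setup.py build_ext --inplace    (or `python setup.py install` for a system-wide install)
import torch
from extensions.curope import cuRoPE2D   # imports the compiled kernels via curope2d.py

print(torch.cuda.is_available())          # the fast path in kernels.cu needs a CUDA device
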
input/cam2world.pt ADDED
Binary file (1.25 kB). View file
 
input/intrinsics.pt ADDED
Binary file (1.07 kB). View file
 
main.py ADDED
@@ -0,0 +1,198 @@
1
+ bb = breakpoint  # short alias for dropping into the debugger
2
+ import torch
3
+ from torch.utils.data import DataLoader
4
+ import wandb
5
+ from argparse import ArgumentParser
6
+ from datasets.octmae import OctMae
7
+ from datasets.foundation_pose import FoundationPose
8
+ from datasets.generic_loader import GenericLoader
9
+
10
+ from utils.collate import collate
11
+ from models.rayquery import RayQuery
12
+ from engine import train_epoch, eval_epoch, eval_model
13
+ import torch.nn as nn
14
+ from models.rayquery import RayQuery, PointmapEncoder, RayEncoder
15
+ from models.losses import *
16
+ import utils.misc as misc
17
+ import os
18
+ from utils.viz import just_load_viz
19
+ from utils.fusion import fuse_batch
20
+ import socket
21
+ import time
22
+ from utils.augmentations import *
23
+
24
+ def parse_args():
25
+ parser = ArgumentParser()
26
+ parser.add_argument("--dataset_train", type=str, default="TableOfCubes(size=10,n_views=2,seed=747)")
27
+ parser.add_argument("--dataset_test", type=str, default="TableOfCubes(size=10,n_views=2,seed=787)")
28
+ parser.add_argument("--dataset_just_load", type=str, default=None)
29
+ parser.add_argument("--logdir", type=str, default="logs")
30
+ parser.add_argument("--batch_size", type=int, default=5)
31
+ parser.add_argument("--n_epochs", type=int, default=100)
32
+ parser.add_argument("--n_workers", type=int, default=4)
33
+ parser.add_argument("--model", type=str, default="RayQuery(ray_enc=RayEncoder(),pointmap_enc=PointmapEncoder(),criterion=RayCompletion(ConfLoss(L21)))")
34
+ parser.add_argument("--save_every", type=int, default=1)
35
+ parser.add_argument("--resume", type=str, default=None)
36
+ parser.add_argument("--eval_every", type=int, default=3)
37
+ parser.add_argument("--wandb_project", type=str, default=None)
38
+ parser.add_argument("--wandb_run_name", type=str, default="init")
39
+ parser.add_argument("--just_load", action="store_true")
40
+ parser.add_argument("--device", type=str, default="cuda")
41
+ parser.add_argument("--rr_addr", type=str, default="0.0.0.0:"+os.getenv("RERUN_RECORDING","9876"))
42
+ parser.add_argument("--mesh", action="store_true")
43
+ parser.add_argument("--max_norm", type=float, default=-1)
44
+ parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)')
45
+ parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR',
46
+ help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
47
+ parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',
48
+ help='lower lr bound for cyclic schedulers that hit 0')
49
+ parser.add_argument('--warmup_epochs', type=int, default=10)
50
+ parser.add_argument('--weight_decay', type=float, default=0.01)
51
+ parser.add_argument('--normalize_mode',type=str,default='None')
52
+ parser.add_argument('--start_from',type=str,default=None)
53
+ parser.add_argument('--augmentor',type=str,default='None')
54
+ return parser.parse_args()
55
+
56
+ def main(args):
57
+ load_dino = False
58
+ if not args.just_load:
59
+ dataset_train = eval(args.dataset_train)
60
+ dataset_test = eval(args.dataset_test)
61
+ if not dataset_train.prefetch_dino:
62
+ load_dino = True
63
+ rank, world_size, local_rank = misc.setup_distributed()
64
+ sampler_train = torch.utils.data.DistributedSampler(
65
+ dataset_train, num_replicas=world_size, rank=rank, shuffle=True
66
+ )
67
+
68
+ sampler_test = torch.utils.data.DistributedSampler(
69
+ dataset_test, num_replicas=world_size, rank=rank, shuffle=False
70
+ )
71
+
72
+ train_loader = DataLoader(
73
+ dataset_train, sampler=sampler_train, batch_size=args.batch_size, shuffle=False, collate_fn=collate,
74
+ num_workers=args.n_workers,
75
+ pin_memory=True,
76
+ prefetch_factor=2,
77
+ drop_last=True
78
+ )
79
+ test_loader = DataLoader(
80
+ dataset_test, sampler=sampler_test, batch_size=args.batch_size, shuffle=False, collate_fn=collate,
81
+ num_workers=args.n_workers,
82
+ pin_memory=True,
83
+ prefetch_factor=2,
84
+ drop_last=True
85
+ )
86
+
87
+ n_scenes_epoch = len(train_loader) * args.batch_size * world_size
88
+ print(f"Number of scenes in epoch: {n_scenes_epoch}")
89
+ else:
90
+ if args.dataset_just_load is None:
91
+ dataset = eval(args.dataset_train)
92
+ else:
93
+ dataset = eval(args.dataset_just_load)
94
+ if not dataset.prefetch_dino:
95
+ load_dino = True
96
+ rank, world_size, local_rank = misc.setup_distributed()
97
+ sampler_train = torch.utils.data.DistributedSampler(
98
+ dataset, num_replicas=world_size, rank=rank, shuffle=False
99
+ )
100
+ just_loader = DataLoader(dataset, sampler=sampler_train, batch_size=args.batch_size, shuffle=False, collate_fn=collate,
101
+ pin_memory=True,
102
+ drop_last=True
103
+ )
104
+
105
+ model = eval(args.model).to(args.device)
106
+ if args.augmentor != 'None':
107
+ augmentor = eval(args.augmentor)
108
+ else:
109
+ augmentor = None
110
+
111
+ if load_dino and len(model.dino_layers) > 0:
112
+ dino_model = torch.hub.load('facebookresearch/dinov2', "dinov2_vitl14_reg")
113
+ dino_model.eval()
114
+ dino_model.to("cuda")
115
+ else:
116
+ dino_model = None
117
+ # distribute model
118
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],find_unused_parameters=True)
119
+ model_without_ddp = model.module if hasattr(model, 'module') else model
120
+
121
+ eff_batch_size = args.batch_size * misc.get_world_size()
122
+ if args.lr is None: # only base_lr is specified
123
+ args.lr = args.blr * eff_batch_size / 256
124
+
125
+ param_groups = misc.add_weight_decay(model_without_ddp, args.weight_decay)
126
+ optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
127
+ os.makedirs(args.logdir,exist_ok=True)
128
+ start_epoch = 0
129
+ print("Running on host %s" % socket.gethostname())
130
+ if args.resume and os.path.exists(os.path.join(args.resume, "checkpoint-latest.pth")):
131
+ checkpoint = torch.load(os.path.join(args.resume, "checkpoint-latest.pth"), map_location='cpu')
132
+ model_without_ddp.load_state_dict(checkpoint['model'])
133
+ model_params = list(model.parameters())
134
+ print("Resume checkpoint %s" % args.resume)
135
+
136
+ if 'optimizer' in checkpoint and 'epoch' in checkpoint:
137
+ optimizer.load_state_dict(checkpoint['optimizer'])
138
+ start_epoch = checkpoint['epoch'] + 1
139
+ print("With optim & sched!")
140
+ del checkpoint
141
+ elif args.start_from is not None:
142
+ checkpoint = torch.load(args.start_from, map_location='cpu')
143
+ model_without_ddp.load_state_dict(checkpoint['model'])
144
+ print("Start from checkpoint %s" % args.start_from)
145
+ if args.just_load:
146
+ with torch.no_grad():
147
+ while True:
148
+ #test_log_dict = eval_epoch(model,just_loader,device=args.device,dino_model=dino_model,args=args)
149
+ for data in just_loader:
150
+ pred, gt, loss_dict, batch = eval_model(model,data,mode='viz',args=args,dino_model=dino_model,augmentor=augmentor)
151
+ # cast to float32 for visualization
152
+ gt = {k: v.float() for k, v in gt.items()}
153
+ pred = {k: v.float() for k, v in pred.items()}
154
+ #loss_dict = eval_model(model,data,mode='loss',device=args.device)
155
+ #print(f"Loss: {loss_dict['loss']:.4f}")
156
+ # summarize all keys in loss_dict in table
157
+ print(f"{'Key':<10} {'Value':<10}")
158
+ print("-"*20)
159
+ for key, value in loss_dict.items():
160
+ print(f"{key:<10}: {value:.4f}")
161
+ print("-"*20)
162
+ name = args.logdir
163
+ addr = args.rr_addr
164
+ if args.mesh:
165
+ fused_meshes = fuse_batch(pred,gt,data, voxel_size=0.002)
166
+ else:
167
+ fused_meshes = None
168
+ just_load_viz(pred,gt,batch,addr=addr,name=name,fused_meshes=fused_meshes)
169
+ breakpoint()
170
+ return
171
+ else:
172
+ if args.wandb_project and misc.get_rank() == 0:
173
+ wandb.init(project=args.wandb_project, name=args.wandb_run_name, config=args)
174
+ log_wandb = args.wandb_project
175
+ else:
176
+ log_wandb = None
177
+ for epoch in range(start_epoch,args.n_epochs):
178
+ start_time = time.time()
179
+ log_dict = train_epoch(model,train_loader,optimizer,device=args.device,max_norm=args.max_norm,epoch=epoch,
180
+ log_wandb=log_wandb,batch_size=eff_batch_size,args=args,dino_model=dino_model,augmentor=augmentor)
181
+ end_time = time.time()
182
+ print(f"Epoch {epoch} train loss: {log_dict['loss']:.4f} grad_norm: {log_dict['grad_norm']:.4f} \n")
183
+ print(f"Time taken for epoch {epoch}: {end_time - start_time:.2f} seconds")
184
+
185
+ if epoch % args.eval_every == 0:
186
+ test_log_dict = eval_epoch(model,test_loader,device=args.device,dino_model=dino_model,args=args,augmentor=augmentor)
187
+ print(f"Epoch {epoch} test loss: {test_log_dict['loss']:.4f} \n")
188
+ if log_wandb:
189
+ wandb_dict = {f"test_{k}":v for k,v in test_log_dict.items()}
190
+ wandb.log(wandb_dict, step=(epoch+1)*n_scenes_epoch)
191
+ if epoch % args.save_every == 0:
192
+ # saving a separate checkpoint per epoch avoids overwriting, but the checkpoints quickly use a lot of disk space
193
+ #misc.save_model(args, epoch, model, optimizer)
194
+ misc.save_model(args, epoch, model_without_ddp, optimizer, epoch_name="latest")
195
+
196
+ if __name__ == "__main__":
197
+ args = parse_args()
198
+ main(args)
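
Note that --dataset_train, --dataset_test, --dataset_just_load, --model, and --augmentor are passed to eval(), so they must be valid Python expressions over the names imported at the top of main.py. For example, the default --model string builds roughly the following (a sketch; the loss names are assumed to come from models.losses via the star import):

from models.rayquery import RayQuery, PointmapEncoder, RayEncoder
from models.losses import RayCompletion, ConfLoss, L21   # assumption: exported by models/losses.py

model = RayQuery(ray_enc=RayEncoder(), pointmap_enc=PointmapEncoder(),
                 criterion=RayCompletion(ConfLoss(L21)))
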
models/blocks.py ADDED
@@ -0,0 +1,235 @@
1
+ # Copied from: https://github.com/naver/croco/blob/743ee71a2a9bf57cea6832a9064a70a0597fcfcb/models/blocks.py
2
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
3
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from itertools import repeat
9
+ import collections.abc
10
+
11
+ def _ntuple(n):
12
+ def parse(x):
13
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
14
+ return x
15
+ return tuple(repeat(x, n))
16
+ return parse
17
+ to_2tuple = _ntuple(2)
18
+
19
+ def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
20
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
21
+ """
22
+ if drop_prob == 0. or not training:
23
+ return x
24
+ keep_prob = 1 - drop_prob
25
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
26
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
27
+ if keep_prob > 0.0 and scale_by_keep:
28
+ random_tensor.div_(keep_prob)
29
+ return x * random_tensor
30
+
31
+ class DropPath(nn.Module):
32
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
33
+ """
34
+ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
35
+ super(DropPath, self).__init__()
36
+ self.drop_prob = drop_prob
37
+ self.scale_by_keep = scale_by_keep
38
+
39
+ def forward(self, x):
40
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
41
+
42
+ def extra_repr(self):
43
+ return f'drop_prob={round(self.drop_prob,3):0.3f}'
44
+
45
+ class Mlp(nn.Module):
46
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks"""
47
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
48
+ super().__init__()
49
+ out_features = out_features or in_features
50
+ hidden_features = hidden_features or in_features
51
+ bias = to_2tuple(bias)
52
+ drop_probs = to_2tuple(drop)
53
+
54
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
55
+ self.act = act_layer()
56
+ self.drop1 = nn.Dropout(drop_probs[0])
57
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
58
+ self.drop2 = nn.Dropout(drop_probs[1])
59
+
60
+ def forward(self, x):
61
+ x = self.fc1(x)
62
+ x = self.act(x)
63
+ x = self.drop1(x)
64
+ x = self.fc2(x)
65
+ x = self.drop2(x)
66
+ return x
67
+
68
+ class Attention(nn.Module):
69
+
70
+ def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
71
+ super().__init__()
72
+ self.num_heads = num_heads
73
+ head_dim = dim // num_heads
74
+ self.scale = head_dim ** -0.5
75
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
76
+ self.attn_drop = nn.Dropout(attn_drop)
77
+ self.proj = nn.Linear(dim, dim)
78
+ self.proj_drop = nn.Dropout(proj_drop)
79
+ self.rope = rope
80
+
81
+ def forward(self, x, xpos):
82
+ B, N, C = x.shape
83
+
84
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3)
85
+ q, k, v = [qkv[:,:,i] for i in range(3)]
86
+ # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple)
87
+
88
+ if self.rope is not None:
89
+ q = self.rope(q, xpos)
90
+ k = self.rope(k, xpos)
91
+
92
+ attn = (q @ k.transpose(-2, -1)) * self.scale
93
+ attn = attn.softmax(dim=-1)
94
+ attn = self.attn_drop(attn)
95
+
96
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
97
+ x = self.proj(x)
98
+ x = self.proj_drop(x)
99
+ return x
100
+
101
+ class Block(nn.Module):
102
+
103
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
104
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
105
+ super().__init__()
106
+ self.norm1 = norm_layer(dim)
107
+ self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
108
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
109
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
110
+ self.norm2 = norm_layer(dim)
111
+ mlp_hidden_dim = int(dim * mlp_ratio)
112
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
113
+
114
+ def forward(self, x, xpos):
115
+ x = x + self.drop_path(self.attn(self.norm1(x), xpos))
116
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
117
+ return x
118
+
119
+ class CrossAttention(nn.Module):
120
+
121
+ def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
122
+ super().__init__()
123
+ self.num_heads = num_heads
124
+ head_dim = dim // num_heads
125
+ self.scale = head_dim ** -0.5
126
+
127
+ self.projq = nn.Linear(dim, dim, bias=qkv_bias)
128
+ self.projk = nn.Linear(dim, dim, bias=qkv_bias)
129
+ self.projv = nn.Linear(dim, dim, bias=qkv_bias)
130
+ self.attn_drop = nn.Dropout(attn_drop)
131
+ self.proj = nn.Linear(dim, dim)
132
+ self.proj_drop = nn.Dropout(proj_drop)
133
+
134
+ self.rope = rope
135
+
136
+ def forward(self, query, key, value, qpos, kpos):
137
+ B, Nq, C = query.shape
138
+ Nk = key.shape[1]
139
+ Nv = value.shape[1]
140
+
141
+ q = self.projq(query).reshape(B,Nq,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
142
+ k = self.projk(key).reshape(B,Nk,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
143
+ v = self.projv(value).reshape(B,Nv,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
144
+
145
+ if self.rope is not None:
146
+ q = self.rope(q, qpos)
147
+ k = self.rope(k, kpos)
148
+
149
+ attn = (q @ k.transpose(-2, -1)) * self.scale
150
+ attn = attn.softmax(dim=-1)
151
+ attn = self.attn_drop(attn)
152
+
153
+ x = (attn @ v).transpose(1, 2).reshape(B, Nq, C)
154
+ x = self.proj(x)
155
+ x = self.proj_drop(x)
156
+ return x
157
+
158
+ class DecoderBlock(nn.Module):
159
+
160
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
161
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None,order='sa_ca'):
162
+ super().__init__()
163
+ self.norm1 = norm_layer(dim)
164
+ self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
165
+ self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
166
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
167
+ self.norm2 = norm_layer(dim)
168
+ self.norm3 = norm_layer(dim)
169
+ mlp_hidden_dim = int(dim * mlp_ratio)
170
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
171
+ self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()
172
+ self.order = order
173
+ self.batch_drop_path_prob = -drop_path if drop_path < 0. else 0.
174
+
175
+ def forward(self, x, y, xpos, ypos):
176
+ if self.order == 'sa_ca':
177
+ if self.batch_drop_path_prob==0.0 or not self.training or torch.rand(1).item()>=self.batch_drop_path_prob: x = x + self.drop_path(self.attn(self.norm1(x), xpos))
178
+ y_ = self.norm_y(y)
179
+ if self.batch_drop_path_prob==0.0 or not self.training or torch.rand(1).item()>=self.batch_drop_path_prob: x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos))
180
+ if self.batch_drop_path_prob==0.0 or not self.training or torch.rand(1).item()>=self.batch_drop_path_prob: x = x + self.drop_path(self.mlp(self.norm3(x)))
181
+ elif self.order == 'ca_sa':
182
+ y_ = self.norm_y(y)
183
+ if self.batch_drop_path_prob==0.0 or not self.training or torch.rand(1).item()>=self.batch_drop_path_prob: x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos))
184
+ if self.batch_drop_path_prob==0.0 or not self.training or torch.rand(1).item()>=self.batch_drop_path_prob: x = x + self.drop_path(self.attn(self.norm1(x), xpos))
185
+ if self.batch_drop_path_prob==0.0 or not self.training or torch.rand(1).item()>=self.batch_drop_path_prob: x = x + self.drop_path(self.mlp(self.norm3(x)))
186
+ return x, y
187
+
188
+
189
+ # patch embedding
190
+ class PositionGetter(object):
191
+ """ return positions of patches """
192
+
193
+ def __init__(self):
194
+ self.cache_positions = {}
195
+
196
+ def __call__(self, b, h, w, device):
197
+ if not (h,w) in self.cache_positions:
198
+ x = torch.arange(w, device=device)
199
+ y = torch.arange(h, device=device)
200
+ self.cache_positions[h,w] = torch.cartesian_prod(y, x) # (h*w, 2)
201
+ pos = self.cache_positions[h,w].view(1, h*w, 2).expand(b, -1, 2).clone()
202
+ return pos
203
+
204
+ class PatchEmbed(nn.Module):
205
+ """ just adding _init_weights + position getter compared to timm.models.layers.patch_embed.PatchEmbed"""
206
+
207
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
208
+ super().__init__()
209
+ img_size = to_2tuple(img_size)
210
+ patch_size = to_2tuple(patch_size)
211
+ self.img_size = img_size
212
+ self.patch_size = patch_size
213
+ self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
214
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
215
+ self.flatten = flatten
216
+
217
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
218
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
219
+
220
+ self.position_getter = PositionGetter()
221
+
222
+ def forward(self, x):
223
+ B, C, H, W = x.shape
224
+ torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
225
+ torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
226
+ x = self.proj(x)
227
+ pos = self.position_getter(B, x.size(2), x.size(3), x.device)
228
+ if self.flatten:
229
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
230
+ x = self.norm(x)
231
+ return x, pos
232
+
233
+ def _init_weights(self):
234
+ w = self.proj.weight.data
235
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
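
A self-contained sketch of how the building blocks above compose (rope=None, so plain attention is used; image size and dimensions are illustrative):

import torch
from models.blocks import PatchEmbed, Block, DecoderBlock

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
enc_block = Block(dim=768, num_heads=12, rope=None)
dec_block = DecoderBlock(dim=768, num_heads=12, rope=None)

img = torch.randn(2, 3, 224, 224)
x, xpos = embed(img)                # (2, 196, 768) tokens and (2, 196, 2) patch positions
x = enc_block(x, xpos)              # self-attention block
y, ypos = x.clone(), xpos.clone()   # stand-in for memory tokens from another view
x, y = dec_block(x, y, xpos, ypos)  # self-attention followed by cross-attention
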
models/heads/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # head factory
6
+ # --------------------------------------------------------
7
+ from .linear_head import LinearPts3d
8
+ from .dpt_head import create_dpt_head, create_dpt_head_mask, create_dpt_head_depth
9
+
10
+ def head_factory(head_type, output_mode, net, has_conf=False):
11
+ """" build a prediction head for the decoder
12
+ """
13
+ if head_type == 'linear' and output_mode == 'pts3d':
14
+ return LinearPts3d(net, has_conf)
15
+ if head_type == 'linear_depth' and output_mode == 'pts3d':
16
+ return LinearPts3d(net, has_conf,mode='depth')
17
+ if head_type == 'linear_classifier' and output_mode == 'pts3d':
18
+ return LinearPts3d(net, has_conf,mode='classifier')
19
+ elif head_type == 'dpt' and output_mode == 'pts3d':
20
+ return create_dpt_head(net, has_conf=has_conf)
21
+ elif head_type == 'dpt_depth' and output_mode == 'pts3d':
22
+ return create_dpt_head_depth(net, has_conf=has_conf)
23
+ elif head_type == 'dpt_mask' and output_mode == 'pts3d':
24
+ return create_dpt_head_mask(net, has_conf=has_conf)
25
+ else:
26
+ raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
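
The factory only reads a handful of attributes from the parent network; a minimal construction sketch with a stand-in config object (the attribute values below are placeholders, not the ones RaySt3R actually uses):

from types import SimpleNamespace
from models.heads import head_factory

net = SimpleNamespace(
    patch_size=16, dec_embed_dim=768, enc_embed_dim=1024, dec_depth=12,
    depth_mode=('exp', -float('inf'), float('inf')),
    conf_mode=('exp', 1, float('inf')),
    classifier_mode=None,
)

linear_head = head_factory('linear', 'pts3d', net, has_conf=True)  # LinearPts3d
dpt_head = head_factory('dpt', 'pts3d', net, has_conf=True)        # PixelwiseTaskWithDPT
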
models/heads/dpt_head.py ADDED
@@ -0,0 +1,582 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from einops import rearrange, repeat
5
+ from typing import Union, Tuple, Iterable, List, Optional, Dict
6
+ from .postprocess import postprocess
7
+
8
+ def pair(t):
9
+ return t if isinstance(t, tuple) else (t, t)
10
+
11
+ def make_scratch(in_shape, out_shape, groups=1, expand=False):
12
+ scratch = nn.Module()
13
+
14
+ out_shape1 = out_shape
15
+ out_shape2 = out_shape
16
+ out_shape3 = out_shape
17
+ out_shape4 = out_shape
18
+ if expand == True:
19
+ out_shape1 = out_shape
20
+ out_shape2 = out_shape * 2
21
+ out_shape3 = out_shape * 4
22
+ out_shape4 = out_shape * 8
23
+
24
+ scratch.layer1_rn = nn.Conv2d(
25
+ in_shape[0],
26
+ out_shape1,
27
+ kernel_size=3,
28
+ stride=1,
29
+ padding=1,
30
+ bias=False,
31
+ groups=groups,
32
+ )
33
+ scratch.layer2_rn = nn.Conv2d(
34
+ in_shape[1],
35
+ out_shape2,
36
+ kernel_size=3,
37
+ stride=1,
38
+ padding=1,
39
+ bias=False,
40
+ groups=groups,
41
+ )
42
+ scratch.layer3_rn = nn.Conv2d(
43
+ in_shape[2],
44
+ out_shape3,
45
+ kernel_size=3,
46
+ stride=1,
47
+ padding=1,
48
+ bias=False,
49
+ groups=groups,
50
+ )
51
+ scratch.layer4_rn = nn.Conv2d(
52
+ in_shape[3],
53
+ out_shape4,
54
+ kernel_size=3,
55
+ stride=1,
56
+ padding=1,
57
+ bias=False,
58
+ groups=groups,
59
+ )
60
+
61
+ scratch.layer_rn = nn.ModuleList([
62
+ scratch.layer1_rn,
63
+ scratch.layer2_rn,
64
+ scratch.layer3_rn,
65
+ scratch.layer4_rn,
66
+ ])
67
+
68
+ return scratch
69
+
70
+ class ResidualConvUnit_custom(nn.Module):
71
+ """Residual convolution module."""
72
+
73
+ def __init__(self, features, activation, bn):
74
+ """Init.
75
+ Args:
76
+ features (int): number of features
77
+ """
78
+ super().__init__()
79
+
80
+ self.bn = bn
81
+ self.groups = 1
82
+
83
+ self.conv1 = nn.Conv2d(
84
+ features,
85
+ features,
86
+ kernel_size=3,
87
+ stride=1,
88
+ padding=1,
89
+ bias=not self.bn,
90
+ groups=self.groups,
91
+ )
92
+
93
+ self.conv2 = nn.Conv2d(
94
+ features,
95
+ features,
96
+ kernel_size=3,
97
+ stride=1,
98
+ padding=1,
99
+ bias=not self.bn,
100
+ groups=self.groups,
101
+ )
102
+
103
+ if self.bn == True:
104
+ self.bn1 = nn.BatchNorm2d(features)
105
+ self.bn2 = nn.BatchNorm2d(features)
106
+
107
+ self.activation = activation
108
+
109
+ self.skip_add = nn.quantized.FloatFunctional()
110
+
111
+ def forward(self, x):
112
+ """Forward pass.
113
+ Args:
114
+ x (tensor): input
115
+ Returns:
116
+ tensor: output
117
+ """
118
+
119
+ out = self.activation(x)
120
+ out = self.conv1(out)
121
+ if self.bn == True:
122
+ out = self.bn1(out)
123
+
124
+ out = self.activation(out)
125
+ out = self.conv2(out)
126
+ if self.bn == True:
127
+ out = self.bn2(out)
128
+
129
+ if self.groups > 1:
130
+ out = self.conv_merge(out)
131
+
132
+ return self.skip_add.add(out, x)
133
+
134
+ class FeatureFusionBlock_custom(nn.Module):
135
+ """Feature fusion block."""
136
+
137
+ def __init__(
138
+ self,
139
+ features,
140
+ activation,
141
+ deconv=False,
142
+ bn=False,
143
+ expand=False,
144
+ align_corners=True,
145
+ width_ratio=1,
146
+ ):
147
+ """Init.
148
+ Args:
149
+ features (int): number of features
150
+ """
151
+ super(FeatureFusionBlock_custom, self).__init__()
152
+ self.width_ratio = width_ratio
153
+
154
+ self.deconv = deconv
155
+ self.align_corners = align_corners
156
+
157
+ self.groups = 1
158
+
159
+ self.expand = expand
160
+ out_features = features
161
+ if self.expand == True:
162
+ out_features = features // 2
163
+
164
+ self.out_conv = nn.Conv2d(
165
+ features,
166
+ out_features,
167
+ kernel_size=1,
168
+ stride=1,
169
+ padding=0,
170
+ bias=True,
171
+ groups=1,
172
+ )
173
+
174
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
175
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
176
+
177
+ self.skip_add = nn.quantized.FloatFunctional()
178
+
179
+ def forward(self, *xs):
180
+ """Forward pass.
181
+ Returns:
182
+ tensor: output
183
+ """
184
+ output = xs[0]
185
+
186
+ if len(xs) == 2:
187
+ res = self.resConfUnit1(xs[1])
188
+ if self.width_ratio != 1:
189
+ res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear')
190
+
191
+ output = self.skip_add.add(output, res)
192
+ # output += res
193
+
194
+ output = self.resConfUnit2(output)
195
+
196
+ if self.width_ratio != 1:
197
+ # and output.shape[3] < self.width_ratio * output.shape[2]
198
+ #size=(image.shape[])
199
+ if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio:
200
+ shape = 3 * output.shape[3]
201
+ else:
202
+ shape = int(self.width_ratio * 2 * output.shape[2])
203
+ output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear')
204
+ else:
205
+ output = nn.functional.interpolate(output, scale_factor=2,
206
+ mode="bilinear", align_corners=self.align_corners)
207
+ output = self.out_conv(output)
208
+ return output
209
+
210
+ def make_fusion_block(features, use_bn, width_ratio=1):
211
+ return FeatureFusionBlock_custom(
212
+ features,
213
+ nn.ReLU(False),
214
+ deconv=False,
215
+ bn=use_bn,
216
+ expand=False,
217
+ align_corners=True,
218
+ width_ratio=width_ratio,
219
+ )
220
+
221
+ class Interpolate(nn.Module):
222
+ """Interpolation module."""
223
+
224
+ def __init__(self, scale_factor, mode, align_corners=False):
225
+ """Init.
226
+ Args:
227
+ scale_factor (float): scaling
228
+ mode (str): interpolation mode
229
+ """
230
+ super(Interpolate, self).__init__()
231
+
232
+ self.interp = nn.functional.interpolate
233
+ self.scale_factor = scale_factor
234
+ self.mode = mode
235
+ self.align_corners = align_corners
236
+
237
+ def forward(self, x):
238
+ """Forward pass.
239
+ Args:
240
+ x (tensor): input
241
+ Returns:
242
+ tensor: interpolated data
243
+ """
244
+
245
+ x = self.interp(
246
+ x,
247
+ scale_factor=self.scale_factor,
248
+ mode=self.mode,
249
+ align_corners=self.align_corners,
250
+ )
251
+
252
+ return x
253
+
254
+ class DPTOutputAdapter(nn.Module):
255
+ """DPT output adapter.
256
+
257
+ :param num_channels: Number of output channels
258
+ :param stride_level: stride level compared to the full-sized image.
259
+ E.g. 4 for 1/4th the size of the image.
260
+ :param patch_size_full: Int or tuple of the patch size over the full image size.
261
+ Patch size for smaller inputs will be computed accordingly.
262
+ :param hooks: Index of intermediate layers
263
+ :param layer_dims: Dimension of intermediate layers
264
+ :param feature_dim: Feature dimension
265
+ :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression
266
+ :param use_bn: If set to True, activates batch norm
267
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
268
+ """
269
+
270
+ def __init__(self,
271
+ num_channels: int = 1,
272
+ stride_level: int = 1,
273
+ patch_size: Union[int, Tuple[int, int]] = 16,
274
+ main_tasks: Iterable[str] = ('rgb',),
275
+ hooks: List[int] = [2, 5, 8, 11],
276
+ layer_dims: List[int] = [96, 192, 384, 768],
277
+ feature_dim: int = 256,
278
+ last_dim: int = 32,
279
+ use_bn: bool = False,
280
+ dim_tokens_enc: Optional[int] = None,
281
+ head_type: str = 'regression',
282
+ output_width_ratio=1,
283
+ **kwargs):
284
+ super().__init__()
285
+ self.num_channels = num_channels
286
+ self.stride_level = stride_level
287
+ self.patch_size = pair(patch_size)
288
+ self.main_tasks = main_tasks
289
+ self.hooks = hooks
290
+ self.layer_dims = layer_dims
291
+ self.feature_dim = feature_dim
292
+ self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None
293
+ self.head_type = head_type
294
+
295
+ # Actual patch height and width, taking into account stride of input
296
+ self.P_H = max(1, self.patch_size[0] // stride_level)
297
+ self.P_W = max(1, self.patch_size[1] // stride_level)
298
+
299
+ self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False)
300
+ self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
301
+ self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
302
+ self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
303
+ self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
304
+
305
+ if self.head_type == 'regression':
306
+ # The "DPTDepthModel" head
307
+ self.head = nn.Sequential(
308
+ nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1),
309
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
310
+ nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1),
311
+ nn.ReLU(True),
312
+ nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0)
313
+ )
314
+ elif self.head_type == 'semseg':
315
+ # The "DPTSegmentationModel" head
316
+ self.head = nn.Sequential(
317
+ nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False),
318
+ nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
319
+ nn.ReLU(True),
320
+ nn.Dropout(0.1, False),
321
+ nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
322
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
323
+ )
324
+ else:
325
+ raise ValueError('DPT head_type must be "regression" or "semseg".')
326
+
327
+ if self.dim_tokens_enc is not None:
328
+ self.init(dim_tokens_enc=dim_tokens_enc)
329
+
330
+ def init(self, dim_tokens_enc=768):
331
+ """
332
+ Initialize parts of decoder that are dependent on dimension of encoder tokens.
333
+ Should be called when setting up MultiMAE.
334
+
335
+ :param dim_tokens_enc: Dimension of tokens coming from encoder
336
+ """
337
+ #print(dim_tokens_enc)
338
+
339
+ # Set up activation postprocessing layers
340
+ if isinstance(dim_tokens_enc, int):
341
+ dim_tokens_enc = 4 * [dim_tokens_enc]
342
+
343
+ self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc]
344
+
345
+ self.act_1_postprocess = nn.Sequential(
346
+ nn.Conv2d(
347
+ in_channels=self.dim_tokens_enc[0],
348
+ out_channels=self.layer_dims[0],
349
+ kernel_size=1, stride=1, padding=0,
350
+ ),
351
+ nn.ConvTranspose2d(
352
+ in_channels=self.layer_dims[0],
353
+ out_channels=self.layer_dims[0],
354
+ kernel_size=4, stride=4, padding=0,
355
+ bias=True, dilation=1, groups=1,
356
+ )
357
+ )
358
+
359
+ self.act_2_postprocess = nn.Sequential(
360
+ nn.Conv2d(
361
+ in_channels=self.dim_tokens_enc[1],
362
+ out_channels=self.layer_dims[1],
363
+ kernel_size=1, stride=1, padding=0,
364
+ ),
365
+ nn.ConvTranspose2d(
366
+ in_channels=self.layer_dims[1],
367
+ out_channels=self.layer_dims[1],
368
+ kernel_size=2, stride=2, padding=0,
369
+ bias=True, dilation=1, groups=1,
370
+ )
371
+ )
372
+
373
+ self.act_3_postprocess = nn.Sequential(
374
+ nn.Conv2d(
375
+ in_channels=self.dim_tokens_enc[2],
376
+ out_channels=self.layer_dims[2],
377
+ kernel_size=1, stride=1, padding=0,
378
+ )
379
+ )
380
+
381
+ self.act_4_postprocess = nn.Sequential(
382
+ nn.Conv2d(
383
+ in_channels=self.dim_tokens_enc[3],
384
+ out_channels=self.layer_dims[3],
385
+ kernel_size=1, stride=1, padding=0,
386
+ ),
387
+ nn.Conv2d(
388
+ in_channels=self.layer_dims[3],
389
+ out_channels=self.layer_dims[3],
390
+ kernel_size=3, stride=2, padding=1,
391
+ )
392
+ )
393
+
394
+ self.act_postprocess = nn.ModuleList([
395
+ self.act_1_postprocess,
396
+ self.act_2_postprocess,
397
+ self.act_3_postprocess,
398
+ self.act_4_postprocess
399
+ ])
400
+
401
+ def adapt_tokens(self, encoder_tokens):
402
+ # Adapt tokens
403
+ x = []
404
+ x.append(encoder_tokens[:, :])
405
+ x = torch.cat(x, dim=-1)
406
+ return x
407
+
408
+ def forward(self, encoder_tokens: List[torch.Tensor], image_size):
409
+ #input_info: Dict):
410
+ assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
411
+ H, W = image_size
412
+
413
+ # Number of patches in height and width
414
+ N_H = H // (self.stride_level * self.P_H)
415
+ N_W = W // (self.stride_level * self.P_W)
416
+
417
+ # Hook decoder onto 4 layers from specified ViT layers
418
+ layers = [encoder_tokens[hook] for hook in self.hooks]
419
+
420
+ # Extract only task-relevant tokens and ignore global tokens.
421
+ layers = [self.adapt_tokens(l) for l in layers]
422
+
423
+ # Reshape tokens to spatial representation
424
+ layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
425
+
426
+ layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
427
+ # Project layers to chosen feature dim
428
+ layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
429
+
430
+ # Fuse layers using refinement stages
431
+ path_4 = self.scratch.refinenet4(layers[3])
432
+ path_3 = self.scratch.refinenet3(path_4, layers[2])
433
+ path_2 = self.scratch.refinenet2(path_3, layers[1])
434
+ path_1 = self.scratch.refinenet1(path_2, layers[0])
435
+
436
+ # Output head
437
+ out = self.head(path_1)
438
+
439
+ return out
440
+
441
+ class DPTOutputAdapter_fix(DPTOutputAdapter):
442
+ """
443
+ Adapt croco's DPTOutputAdapter implementation for dust3r:
444
+ remove duplicated weigths, and fix forward for dust3r
445
+ """
446
+
447
+ def init(self, dim_tokens_enc=768,**kwargs):
448
+ super().init(dim_tokens_enc,**kwargs)
449
+ # these are duplicated weights
450
+ del self.act_1_postprocess
451
+ del self.act_2_postprocess
452
+ del self.act_3_postprocess
453
+ del self.act_4_postprocess
454
+
455
+ def forward(self, encoder_tokens: List[torch.Tensor], image_size=None):
456
+ assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
457
+ # H, W = input_info['image_size']
458
+ image_size = self.image_size if image_size is None else image_size
459
+ H, W = image_size
460
+ # Number of patches in height and width
461
+ N_H = H // (self.stride_level * self.P_H)
462
+ N_W = W // (self.stride_level * self.P_W)
463
+
464
+ # Hook decoder onto 4 layers from specified ViT layers
465
+ layers = [encoder_tokens[hook] for hook in self.hooks]
466
+
467
+ # Extract only task-relevant tokens and ignore global tokens.
468
+ layers = [self.adapt_tokens(l) for l in layers]
469
+
470
+ # Reshape tokens to spatial representation
471
+ layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
472
+
473
+ layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
474
+ # Project layers to chosen feature dim
475
+ layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
476
+
477
+ # Fuse layers using refinement stages
478
+ path_4 = self.scratch.refinenet4(layers[3])[:, :, :layers[2].shape[2], :layers[2].shape[3]]
479
+ path_3 = self.scratch.refinenet3(path_4, layers[2])
480
+ path_2 = self.scratch.refinenet2(path_3, layers[1])
481
+ path_1 = self.scratch.refinenet1(path_2, layers[0])
482
+
483
+ # Output head
484
+ out = self.head(path_1)
485
+ return out
486
+
487
+
488
+ class PixelwiseTaskWithDPT(nn.Module):
489
+ """ DPT module for dust3r, can return 3D points + confidence for all pixels"""
490
+
491
+ def __init__(self, *, n_cls_token=0, hooks_idx=None, dim_tokens=None,
492
+ output_width_ratio=1, num_channels=1, postprocess=None, depth_mode=None, conf_mode=None, classifier_mode=None, **kwargs):
493
+ super(PixelwiseTaskWithDPT, self).__init__()
494
+ self.return_all_layers = True # backbone needs to return all layers
495
+ self.postprocess = postprocess
496
+ self.depth_mode = depth_mode
497
+ self.conf_mode = conf_mode
498
+ self.classifier_mode = classifier_mode
499
+
500
+ assert n_cls_token == 0, "Not implemented"
501
+ dpt_args = dict(output_width_ratio=output_width_ratio,
502
+ num_channels=num_channels,
503
+ **kwargs)
504
+ if hooks_idx is not None:
505
+ dpt_args.update(hooks=hooks_idx)
506
+ self.dpt = DPTOutputAdapter_fix(**dpt_args)
507
+ dpt_init_args = {} if dim_tokens is None else {'dim_tokens_enc': dim_tokens}
508
+ self.dpt.init(**dpt_init_args)
509
+
510
+ def forward(self, x, img_info):
511
+ out = self.dpt(x, image_size=(img_info[0], img_info[1]))
512
+ if self.postprocess:
513
+ out = self.postprocess(out, self.depth_mode, self.conf_mode,self.classifier_mode)
514
+ return out
515
+
516
+ def create_dpt_head(net, has_conf=False):
517
+ """
518
+ return PixelwiseTaskWithDPT for given net params
519
+ """
520
+ assert net.dec_depth > 9
521
+ l2 = net.dec_depth - 1
522
+ feature_dim = 256
523
+ last_dim = feature_dim//2
524
+ out_nchan = 3
525
+ ed = net.enc_embed_dim
526
+ dd = net.dec_embed_dim
527
+ return PixelwiseTaskWithDPT(num_channels=out_nchan + has_conf,
528
+ feature_dim=feature_dim,
529
+ last_dim=last_dim,
530
+ hooks_idx=[0, l2*2//4, l2*3//4, l2],
531
+ dim_tokens=[ed, dd, dd, dd],
532
+ postprocess=postprocess,
533
+ depth_mode=net.depth_mode,
534
+ conf_mode=net.conf_mode,
535
+ head_type='regression',
536
+ patch_size=net.patch_size)
537
+
538
+ def create_dpt_head_depth(net, has_conf=False):
539
+ """
540
+ return PixelwiseTaskWithDPT for given net params
541
+ """
542
+ assert net.dec_depth > 9
543
+ l2 = net.dec_depth - 1
544
+ feature_dim = 256
545
+ last_dim = feature_dim//2
546
+ out_nchan = 1
547
+ ed = net.enc_embed_dim
548
+ dd = net.dec_embed_dim
549
+ return PixelwiseTaskWithDPT(num_channels=out_nchan + has_conf,
550
+ feature_dim=feature_dim,
551
+ last_dim=last_dim,
552
+ hooks_idx=[0, l2*2//4, l2*3//4, l2],
553
+ dim_tokens=[ed, dd, dd, dd],
554
+ postprocess=postprocess,
555
+ depth_mode=net.depth_mode,
556
+ conf_mode=net.conf_mode,
557
+ head_type='regression',
558
+ patch_size=net.patch_size)
559
+
560
+
561
+ def create_dpt_head_mask(net, has_conf=False):
562
+ """
563
+ return PixelwiseTaskWithDPT for given net params
564
+ """
565
+ assert net.dec_depth > 9
566
+ l2 = net.dec_depth - 1
567
+ feature_dim = 256
568
+ last_dim = feature_dim//2
569
+ out_nchan = 3
570
+ ed = net.enc_embed_dim
571
+ dd = net.dec_embed_dim
572
+ return PixelwiseTaskWithDPT(num_channels=1 + has_conf,
573
+ feature_dim=feature_dim,
574
+ last_dim=last_dim,
575
+ hooks_idx=[0, l2*2//4, l2*3//4, l2],
576
+ dim_tokens=[ed, dd, dd, dd],
577
+ postprocess=postprocess,
578
+ depth_mode=net.depth_mode,
579
+ conf_mode=net.conf_mode,
580
+ classifier_mode=net.classifier_mode,
581
+ head_type='regression',
582
+ patch_size=net.patch_size)
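
For reference, the three factories above hook the DPT decoder onto four evenly spaced decoder layers. A minimal sketch of the index arithmetic (dec_depth=12 is an illustrative value, not a repo default):

```python
# Hypothetical decoder depth; the factories above assert net.dec_depth > 9.
dec_depth = 12
l2 = dec_depth - 1                             # index of the last decoder block
hooks_idx = [0, l2 * 2 // 4, l2 * 3 // 4, l2]  # evenly spaced hook layers
print(hooks_idx)                               # [0, 5, 8, 11]
```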
models/heads/linear_head.py ADDED
@@ -0,0 +1,42 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from .postprocess import postprocess
5
+
6
+ class LinearPts3d (nn.Module):
7
+ """
8
+ Linear head for dust3r
9
+ Each token outputs: - 16x16 3D points (+ confidence)
10
+ """
11
+
12
+ def __init__(self, net, has_conf=False,mode='pts3d'):
13
+ super().__init__()
14
+ self.patch_size = net.patch_size
15
+ self.depth_mode = net.depth_mode
16
+ self.conf_mode = net.conf_mode
17
+ self.has_conf = has_conf
18
+ self.mode = mode
19
+ self.classifier_mode = None
20
+ if self.mode == 'pts3d':
21
+ self.proj = nn.Linear(net.dec_embed_dim, (3 + has_conf)*self.patch_size**2)
22
+ elif self.mode == 'depth':
23
+ self.proj = nn.Linear(net.dec_embed_dim, (1 + has_conf)*self.patch_size**2)
24
+ elif self.mode == 'classifier':
25
+ self.proj = nn.Linear(net.dec_embed_dim, (1 + has_conf)*self.patch_size**2)
26
+ self.classifier_mode = net.classifier_mode
27
+
28
+ def setup(self, croconet):
29
+ pass
30
+
31
+ def forward(self, decout, img_shape):
32
+ H, W = img_shape
33
+ tokens = decout[-1]
34
+ B, S, D = tokens.shape
35
+
36
+ # extract 3D points
37
+ feat = self.proj(tokens) # B,S,D
38
+ feat = feat.transpose(-1, -2).view(B, -1, H//self.patch_size, W//self.patch_size)
39
+ feat = F.pixel_shuffle(feat, self.patch_size) # B,3,H,W
40
+
41
+ # permute + norm depth
42
+ return postprocess(feat, self.depth_mode, self.conf_mode,self.classifier_mode)
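
A hedged usage sketch for the linear head above; the `SimpleNamespace` is a stand-in for the real network object (only the attributes `LinearPts3d` reads), and the shapes are illustrative:

```python
import torch
from types import SimpleNamespace
from models.heads.linear_head import LinearPts3d

# Stand-in for the real net: only the attributes LinearPts3d reads.
net = SimpleNamespace(patch_size=16, dec_embed_dim=256,
                      depth_mode=('exp', -float('inf'), float('inf')),
                      conf_mode=('exp', 1, float('inf')))
head = LinearPts3d(net, has_conf=True, mode='pts3d')

H, W = 64, 64
tokens = torch.randn(2, (H // 16) * (W // 16), 256)   # B, S, D decoder tokens
out = head([tokens], img_shape=(H, W))
print(out['pointmaps'].shape)        # torch.Size([2, 64, 64, 3])
print(out['conf_pointmaps'].shape)   # torch.Size([2, 64, 64])
```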
models/heads/postprocess.py ADDED
@@ -0,0 +1,80 @@
1
+ import torch
2
+
3
+ def postprocess(out, depth_mode, conf_mode,classifier_mode=None):
4
+ """
5
+ extract 3D points/confidence from prediction head output
6
+ """
7
+ fmap = out.permute(0, 2, 3, 1) # B,H,W,3
8
+ if classifier_mode is None:
9
+ if fmap.shape[-1] == 4:
10
+ res = dict(pointmaps=reg_dense_pts3d(fmap[:, :, :, :-1], mode=depth_mode))
11
+ else:
12
+ res = dict(depths=reg_dense_depth(fmap[:, :, :, 0], mode=depth_mode))
13
+ if conf_mode is not None:
14
+ res['conf_pointmaps'] = reg_dense_conf(fmap[:, :, :, -1], mode=conf_mode)
15
+ else:
16
+ res = dict(classifier=reg_dense_classifier(fmap[:, :, :, 0], mode=classifier_mode))
17
+ if conf_mode is not None:
18
+ res['conf_classifier'] = reg_dense_conf(fmap[:, :, :, 1], mode=conf_mode)
19
+
20
+ return res
21
+
22
+ def reg_dense_classifier(x, mode):
23
+ """
24
+ extract classifier from prediction head output
25
+ """
26
+ mode, vmin, vmax = mode
27
+ #return torch.sigmoid(x)
28
+ return x
29
+
30
+ def reg_dense_depth(x, mode):
31
+ """
32
+ extract depth from prediction head output
33
+ """
34
+ mode, vmin, vmax = mode
35
+ no_bounds = (vmin == -float('inf')) and (vmax == float('inf'))
36
+ assert no_bounds
37
+ if mode == 'linear':
38
+ return x
39
+ elif mode == 'square':
40
+ return x.square().clip(min=vmin, max=vmax)
41
+ elif mode == 'exp':
42
+ return torch.exp(x).clip(min=vmin, max=vmax)
43
+ else:
44
+ raise ValueError(f'bad {mode=}')
45
+
46
+ def reg_dense_pts3d(xyz, mode):
47
+ """
48
+ extract 3D points from prediction head output
49
+ """
50
+ mode, vmin, vmax = mode
51
+
52
+ no_bounds = (vmin == -float('inf')) and (vmax == float('inf'))
53
+ assert no_bounds
54
+
55
+ if mode == 'linear':
56
+ if no_bounds:
57
+ return xyz # [-inf, +inf]
58
+ return xyz.clip(min=vmin, max=vmax)
59
+
60
+ # distance to origin
61
+ d = xyz.norm(dim=-1, keepdim=True)
62
+ xyz = xyz / d.clip(min=1e-8)
63
+ if mode == 'square':
64
+ return xyz * d.square()
65
+
66
+ if mode == 'exp':
67
+ return xyz * torch.expm1(d)
68
+ raise ValueError(f'bad {mode=}')
69
+
70
+ def reg_dense_conf(x, mode):
71
+ """
72
+ extract confidence from prediction head output
73
+ """
74
+ mode, vmin, vmax = mode
75
+ if mode == 'exp':
76
+ return vmin + x.exp().clip(max=vmax-vmin)
77
+ if mode == 'sigmoid':
78
+ return (vmax - vmin) * torch.sigmoid(x) + vmin
79
+ raise ValueError(f'bad {mode=}')
80
+
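
A small sketch of how the mode tuples drive `postprocess`, using the default depth/conf modes that appear later in this diff (models/rayquery.py); the input tensor is a dummy head output:

```python
import torch
from models.heads.postprocess import postprocess

out = torch.randn(1, 4, 16, 16)   # B, C, H, W with C = 3 (xyz) + 1 (confidence)
res = postprocess(out,
                  depth_mode=('exp', -float('inf'), float('inf')),
                  conf_mode=('exp', 1, float('inf')))
print(res['pointmaps'].shape)        # torch.Size([1, 16, 16, 3])
print(res['conf_pointmaps'].shape)   # torch.Size([1, 16, 16])
```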
models/losses.py ADDED
@@ -0,0 +1,257 @@
1
+ bb = breakpoint
2
+ import torch
3
+ import torch.nn as nn
4
+ import copy
5
+ from utils.geometry import normalize_pointcloud
6
+
7
+ class Criterion (nn.Module):
8
+ def __init__(self, criterion=None):
9
+ super().__init__()
10
+ self.criterion = copy.deepcopy(criterion)
11
+
12
+ def get_name(self):
13
+ return f'{type(self).__name__}({self.criterion})'
14
+
15
+ class CrocoLoss (nn.Module):
16
+ def __init__(self,mode='vanilla',eps=1e-4):
17
+ super().__init__()
18
+ self.mode = mode
19
+ def get_name(self):
20
+ return f'CrocoLoss({self.mode})'
21
+
22
+ def forward(self, pred, gt, **kw):
23
+ pred_pts = pred['pointmaps']
24
+ conf = pred['conf']
25
+
26
+ if self.mode == 'vanilla':
27
+ loss = torch.abs(gt-pred_pts)/(torch.exp(conf)) + conf
28
+ elif self.mode == 'bounded_1':
29
+ a=0.25
30
+ b=4.
31
+ conf = (b-a)*torch.sigmoid(conf) + a
32
+ loss = torch.abs(gt-pred_pts)/(conf) + torch.log(conf)
33
+ elif self.mode == 'bounded_2':
34
+ a = 3.0
35
+ b = 3.0
36
+ conf = 2*a * (torch.sigmoid(conf/b)-0.5)
37
+ loss = torch.abs(gt-pred_pts)/torch.exp(conf) + conf
38
+ return loss.mean()
39
+
40
+ class SMDLoss (nn.Module):
41
+ def __init__(self,raw_loss,mode='linear'):
42
+ super().__init__()
43
+ self.mode = mode
44
+ self.raw_loss = raw_loss
45
+ def get_name(self):
46
+ return f'SMDLoss({self.raw_loss},{self.mode})'
47
+
48
+ def forward(self, pred, gt,eps, **kw):
49
+ p_gt = compute_probs(pred,gt,eps=eps)
50
+ # filtering out nan values
51
+ loss = self.raw_loss(p_gt)
52
+ loss_mask = ~torch.isnan(p_gt) & (loss != torch.inf).bool()
53
+ loss = loss[loss_mask]
54
+ return loss.mean()
55
+
56
+ # https://github.com/naver/dust3r/blob/c9e9336a6ba7c1f1873f9295852cea6dffaf770d/dust3r/losses.py#L197
57
+ class ConfLoss (nn.Module):
58
+ """ Weighted regression by learned confidence.
59
+ Assuming the input pixel_loss is a pixel-level regression loss.
60
+
61
+ Principle:
62
+ high-confidence means high conf = 10 ==> conf_loss = x * 10 - alpha*log(10)
+ low confidence means low conf = 0.1 ==> conf_loss = x / 10 + alpha*log(10)
64
+
65
+ alpha: hyperparameter
66
+ """
67
+
68
+ def __init__(self, raw_loss, alpha=0.2,skip_conf=False):
69
+ super().__init__()
70
+ assert alpha > 0
71
+ self.alpha = alpha
72
+ self.raw_loss = raw_loss
73
+ self.skip_conf = skip_conf
74
+
75
+ def get_name(self):
76
+ return f'ConfLoss({self.raw_loss})'
77
+
78
+ def get_conf_log(self, x):
79
+ return x, torch.log(x)
80
+
81
+ def forward(self, pred, gt,conf, **kw):
82
+ # compute per-pixel loss
83
+ loss = self.raw_loss(gt, pred, **kw)
84
+ # weight by confidence
85
+ if not self.skip_conf:
86
+ conf, log_conf = self.get_conf_log(conf)
87
+ conf_loss = loss * conf - self.alpha * log_conf
88
+ ## average + nan protection (in case of no valid pixels at all)
89
+ conf_loss = conf_loss.mean() if conf_loss.numel() > 0 else 0
90
+ return conf_loss
91
+ else:
92
+ return loss.mean()
93
+
94
+
95
+ class BCELoss(nn.Module):
96
+ def __init__(self):
97
+ super().__init__()
98
+
99
+ def get_name(self):
100
+ return f'BCELoss()'
101
+
102
+ def forward(self, gt, pred):
103
+ # return torch.nn.functional.binary_cross_entropy(pred, gt)
104
+ return torch.nn.functional.binary_cross_entropy_with_logits(pred, gt)
105
+
106
+ class ClassifierLoss(nn.Module):
107
+ def __init__(self,criterion):
108
+ super().__init__()
109
+ self.criterion = criterion
110
+
111
+ def get_name(self):
112
+ return f'ClassifierLoss({self.criterion})'
113
+
114
+ def forward(self, pred, gt):
115
+ return self.criterion(pred, gt)
116
+
117
+ class BaseCriterion(nn.Module):
118
+ def __init__(self, reduction='none'):
119
+ super().__init__()
120
+ self.reduction = reduction
121
+
122
+ class NLLLoss (BaseCriterion):
123
+ """ Negative log likelihood loss """
124
+ def forward(self, pred):
125
+ # assuming the pred is already a log (for stability sake)
126
+ return -pred
127
+ #return -torch.log(pred)
128
+
129
+ class LLoss (BaseCriterion):
130
+ """ L-norm loss
131
+ """
132
+ def forward(self, a, b):
133
+ assert a.shape == b.shape and a.ndim >= 2 and 1 <= a.shape[-1] <= 3, f'Bad shape = {a.shape}'
134
+ dist = self.distance(a, b)
135
+ assert dist.ndim == a.ndim - 1 # one dimension less
136
+ if self.reduction == 'none':
137
+ return dist
138
+ if self.reduction == 'sum':
139
+ return dist.sum()
140
+ if self.reduction == 'mean':
141
+ return dist.mean() if dist.numel() > 0 else dist.new_zeros(())
142
+ raise ValueError(f'bad {self.reduction=} mode')
143
+
144
+ def distance(self, a, b):
145
+ raise NotImplementedError()
146
+
147
+ class L21Loss (LLoss):
148
+ """ Euclidean distance between 3d points """
149
+
150
+ def distance(self, a, b):
151
+ return torch.norm(a - b, dim=-1)
152
+
153
+ L21 = L21Loss()
154
+
155
+ def apply_log_to_norm(xyz):
156
+ d = xyz.norm(dim=-1, keepdim=True)
157
+ xyz = xyz / d.clip(min=1e-8)
158
+ xyz = xyz * torch.log1p(d)
159
+ return xyz
160
+
161
+ class DepthCompletion (Criterion):
162
+ def __init__(self, criterion, classifier_criterion=None,norm_mode='?None', loss_in_log=False,device='cuda',lambda_classifier=1.0):
163
+ super().__init__(criterion)
164
+ self.criterion.reduction = 'none'
165
+ self.loss_in_log = loss_in_log
166
+ self.device = device
167
+ self.lambda_classifier = lambda_classifier
168
+ self.classifier_criterion = classifier_criterion
169
+
170
+ if norm_mode.startswith('?'):
171
+ # do not normalize points from metric-scale datasets
172
+ self.norm_all = False
173
+ self.norm_mode = norm_mode[1:]
174
+ else:
175
+ self.norm_all = True
176
+ self.norm_mode = norm_mode
177
+
178
+ def forward(self, pred_dict, gt_dict,**kw):
179
+ gt_depths = gt_dict['depths']
180
+ pred_depths = pred_dict['depths']
181
+ gt_masks = gt_dict['valid_masks']
182
+ if gt_masks.sum() == 0:
183
+ return None
184
+ else:
185
+ gt_depths_masked = gt_depths[gt_masks].view(-1,1)
186
+ pred_depths_masked = pred_depths[gt_masks].view(-1,1)
187
+ # this is a loss on the points on the objects
188
+ loss_dict = {'loss_points':self.criterion(pred_depths_masked, gt_depths_masked,pred_dict['conf_pointmaps'][gt_masks])}
189
+ # loss on predicting a mask for the points on the objects
190
+ if 'classifier' in pred_dict and self.classifier_criterion is not None:
191
+ loss_dict['loss_classifier'] = self.classifier_criterion(pred_dict['classifier'], gt_dict['valid_masks'].float(),pred_dict['conf_classifier'])
192
+ loss_dict['loss'] = loss_dict['loss_points'] + self.lambda_classifier * loss_dict['loss_classifier']
193
+ else:
194
+ loss_dict['loss'] = loss_dict['loss_points']
195
+
196
+ return loss_dict
197
+
198
+
199
+ class RayCompletion (Criterion):
200
+ def __init__(self, criterion, classifier_criterion=None,norm_mode='?None', loss_in_log=False,device='cuda',lambda_classifier=1.0):
201
+ super().__init__(criterion)
202
+ self.criterion.reduction = 'none'
203
+ self.loss_in_log = loss_in_log
204
+ self.device = device
205
+ self.lambda_classifier = lambda_classifier
206
+ self.classifier_criterion = classifier_criterion
207
+
208
+ if norm_mode.startswith('?'):
209
+ # do not normalize points from metric-scale datasets
210
+ self.norm_all = False
211
+ self.norm_mode = norm_mode[1:]
212
+ else:
213
+ self.norm_all = True
214
+ self.norm_mode = norm_mode
215
+
216
+ def get_all_pts3d(self, gt_dict, pred_dict):
217
+ gt_pts1 = gt_dict['pointmaps']
218
+ #gt_pts_context = gt_dict['pointmaps_context'][:,0] # we use the first camera given as input for normalization, in our current case that's the only cam
219
+ if 'pointmaps' in pred_dict:
220
+ pr_pts1 = pred_dict['pointmaps']
221
+ else:
222
+ pr_pts1 = None
223
+ mask = gt_dict['valid_masks'].clone()
224
+ # normalize 3d points
225
+ norm_factor = None
226
+
227
+ return gt_pts1, pr_pts1, mask, norm_factor
228
+
229
+ def forward(self, pred_dict, gt_dict, eps=None,**kw):
230
+ gt_pts1, pred_pts1, mask, norm_factor = \
231
+ self.get_all_pts3d(gt_dict, pred_dict, **kw)
232
+ if mask.sum() == 0:
233
+ return None
234
+ else:
235
+ mask_repeated = mask.unsqueeze(-1).repeat(1,1,1,3)
236
+ if norm_factor is not None:
237
+ pred_pts1 = pred_pts1 / norm_factor
238
+ gt_pts1 = gt_pts1 / norm_factor
239
+
240
+ pred_pts1 = pred_pts1[mask_repeated].reshape(-1,3)
241
+ gt_pts1 = gt_pts1[mask_repeated].reshape(-1,3)
242
+
243
+ if self.loss_in_log and self.loss_in_log != 'before':
244
+ # this only make sense when depth_mode == 'exp'
245
+ pred_pts1 = apply_log_to_norm(pred_pts1)
246
+ gt_pts1 = apply_log_to_norm(gt_pts1)
247
+
248
+ # this is a loss on the points on the objects
249
+ loss_dict = {'loss_points':self.criterion(pred_pts1, gt_pts1,pred_dict['conf_pointmaps'][mask])}
250
+ # loss on predicting a mask for the points on the objects
251
+ if 'classifier' in pred_dict and self.classifier_criterion is not None:
252
+ loss_dict['loss_classifier'] = self.classifier_criterion(pred_dict['classifier'], gt_dict['valid_masks'].float(),pred_dict['conf_classifier'])
253
+ loss_dict['loss'] = loss_dict['loss_points'] + self.lambda_classifier * loss_dict['loss_classifier']
254
+ else:
255
+ loss_dict['loss'] = loss_dict['loss_points']
256
+
257
+ return loss_dict
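
A minimal sketch of the confidence weighting in `ConfLoss` on flattened points, using names defined in this file; the tensors are synthetic:

```python
import torch
from models.losses import ConfLoss, L21

crit = ConfLoss(L21, alpha=0.2)
pred = torch.randn(100, 3)                 # predicted 3D points, already masked/flattened
gt = pred + 0.01 * torch.randn(100, 3)     # ground truth close to the prediction
conf = torch.full((100,), 2.0)             # per-point confidence (> 0 so log() is defined)
loss = crit(pred, gt, conf)                # mean of ||err|| * conf - alpha * log(conf)
print(loss.item())
```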
models/pos_embed.py ADDED
@@ -0,0 +1,156 @@
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ # --------------------------------------------------------
4
+ # Position embedding utils
5
+ # --------------------------------------------------------
6
+
7
+
8
+ import numpy as np
9
+
10
+ import torch
11
+
12
+ # --------------------------------------------------------
13
+ # 2D sine-cosine position embedding
14
+ # References:
15
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
16
+ # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
17
+ # MoCo v3: https://github.com/facebookresearch/moco-v3
18
+ # --------------------------------------------------------
19
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
20
+ """
21
+ grid_size: int of the grid height and width
22
+ return:
23
+ pos_embed: [grid_size*grid_size, embed_dim] or [n_cls_token+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
24
+ """
25
+ grid_h = np.arange(grid_size, dtype=np.float32)
26
+ grid_w = np.arange(grid_size, dtype=np.float32)
27
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
28
+ grid = np.stack(grid, axis=0)
29
+
30
+ grid = grid.reshape([2, 1, grid_size, grid_size])
31
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
32
+ if n_cls_token>0:
33
+ pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0)
34
+ return pos_embed
35
+
36
+
37
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
38
+ assert embed_dim % 2 == 0
39
+
40
+ # use half of dimensions to encode grid_h
41
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
42
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
43
+
44
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
45
+ return emb
46
+
47
+
48
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
49
+ """
50
+ embed_dim: output dimension for each position
51
+ pos: a list of positions to be encoded: size (M,)
52
+ out: (M, D)
53
+ """
54
+ assert embed_dim % 2 == 0
55
+ omega = np.arange(embed_dim // 2, dtype=float)
56
+ omega /= embed_dim / 2.
57
+ omega = 1. / 10000**omega # (D/2,)
58
+
59
+ pos = pos.reshape(-1) # (M,)
60
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
61
+
62
+ emb_sin = np.sin(out) # (M, D/2)
63
+ emb_cos = np.cos(out) # (M, D/2)
64
+
65
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
66
+ return emb
67
+
68
+
69
+ # --------------------------------------------------------
70
+ # Interpolate position embeddings for high-resolution
71
+ # References:
72
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
73
+ # DeiT: https://github.com/facebookresearch/deit
74
+ # --------------------------------------------------------
75
+ def interpolate_pos_embed(model, checkpoint_model):
76
+ if 'pos_embed' in checkpoint_model:
77
+ pos_embed_checkpoint = checkpoint_model['pos_embed']
78
+ embedding_size = pos_embed_checkpoint.shape[-1]
79
+ num_patches = model.patch_embed.num_patches
80
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
81
+ # height (== width) for the checkpoint position embedding
82
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
83
+ # height (== width) for the new position embedding
84
+ new_size = int(num_patches ** 0.5)
85
+ # class_token and dist_token are kept unchanged
86
+ if orig_size != new_size:
87
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
88
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
89
+ # only the position tokens are interpolated
90
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
91
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
92
+ pos_tokens = torch.nn.functional.interpolate(
93
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
94
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
95
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
96
+ checkpoint_model['pos_embed'] = new_pos_embed
97
+
98
+
99
+ #----------------------------------------------------------
100
+ # RoPE2D: RoPE implementation in 2D
101
+ #----------------------------------------------------------
102
+
103
+ try:
104
+ from extensions.curope import cuRoPE2D
105
+ RoPE2D = cuRoPE2D
106
+ except ImportError:
107
+ print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')
108
+
109
+ class RoPE2D(torch.nn.Module):
110
+
111
+ def __init__(self, freq=100.0, F0=1.0):
112
+ super().__init__()
113
+ self.base = freq
114
+ self.F0 = F0
115
+ self.cache = {}
116
+
117
+ def get_cos_sin(self, D, seq_len, device, dtype):
118
+ if (D,seq_len,device,dtype) not in self.cache:
119
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
120
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
121
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
122
+ freqs = torch.cat((freqs, freqs), dim=-1)
123
+ cos = freqs.cos() # (Seq, Dim)
124
+ sin = freqs.sin()
125
+ self.cache[D,seq_len,device,dtype] = (cos,sin)
126
+ return self.cache[D,seq_len,device,dtype]
127
+
128
+ @staticmethod
129
+ def rotate_half(x):
130
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
131
+ return torch.cat((-x2, x1), dim=-1)
132
+
133
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
134
+ assert pos1d.ndim==2
135
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
136
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
137
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
138
+
139
+ def forward(self, tokens, positions):
140
+ """
141
+ input:
142
+ * tokens: batch_size x nheads x ntokens x dim
143
+ * positions: batch_size x ntokens x 2 (y and x position of each token)
144
+ output:
145
+ * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
146
+ """
147
+ assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
148
+ D = tokens.size(3) // 2
149
+ assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
150
+ cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
151
+ # split features into two along the feature dimension, and apply rope1d on each half
152
+ y, x = tokens.chunk(2, dim=-1)
153
+ y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
154
+ x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
155
+ tokens = torch.cat((y, x), dim=-1)
156
+ return tokens
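
A usage sketch for the RoPE2D module above, assuming the pure-PyTorch fallback (no compiled curope extension) and CPU tensors:

```python
import torch
from models.pos_embed import RoPE2D

rope = RoPE2D(freq=100.0)
B, heads, H, W, dim = 2, 4, 8, 8, 64                   # dim must be even
tokens = torch.randn(B, heads, H * W, dim)
ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing='ij')
positions = torch.stack([ys, xs], dim=-1).reshape(1, H * W, 2).repeat(B, 1, 1)
out = rope(tokens, positions)                          # same shape as tokens
print(out.shape)                                       # torch.Size([2, 4, 64, 64])
```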
models/rayquery.py ADDED
@@ -0,0 +1,227 @@
1
+ bb = breakpoint
2
+ import torch
3
+ import torch.nn as nn
4
+ from models.blocks import DecoderBlock, Block, PatchEmbed, PositionGetter
5
+ from models.pos_embed import get_2d_sincos_pos_embed, RoPE2D
6
+ from models.losses import *
7
+ from utils.geometry import center_pointmaps, compute_rays
8
+ from models.heads import head_factory
9
+
10
+ def init_weights(m):
11
+ if isinstance(m, nn.Linear):
12
+ # we use xavier_uniform following official JAX ViT:
13
+ torch.nn.init.xavier_uniform_(m.weight)
14
+ if isinstance(m, nn.Linear) and m.bias is not None:
15
+ nn.init.constant_(m.bias, 0)
16
+ elif isinstance(m, nn.LayerNorm):
17
+ if m.bias is not None:
18
+ nn.init.constant_(m.bias, 0)
19
+ if m.weight is not None:
20
+ nn.init.constant_(m.weight, 1.0)
21
+ elif isinstance(m, nn.Parameter):
22
+ nn.init.normal_(m, std=0.02)
23
+
24
+ class RayEncoder(nn.Module):
25
+ def __init__(self,
26
+ dim=256,
27
+ patch_size=8,
28
+ img_size=(128,128),
29
+ depth=3,
30
+ num_heads=4,
31
+ pos_embed='RoPE100',
32
+ ):
33
+ super().__init__()
34
+ self.img_size = img_size
35
+ self.patch_embed = PatchEmbed(img_size=self.img_size, patch_size=patch_size, in_chans=2, embed_dim=dim)
36
+ self.dim = dim
37
+ if pos_embed.startswith('RoPE'):
38
+ freq = float(pos_embed[len('RoPE'):])
39
+ self.rope = RoPE2D(freq=freq)
40
+ else:
41
+ self.rope = None
42
+ self.blocks = nn.ModuleList([Block(dim=dim, num_heads=num_heads,rope=self.rope) for _ in range(depth)])
43
+ self.initialize_weights()
44
+
45
+ def initialize_weights(self):
46
+ # patch embed
47
+ self.patch_embed._init_weights()
48
+
49
+ # linears and layer norms
50
+ self.apply(init_weights)
51
+
52
+ def forward(self, rays):
53
+ rays = rays.permute(0,3,1,2)
54
+ rays, pos = self.patch_embed(rays)
55
+ for blk in self.blocks:
56
+ rays = blk(rays, pos)
57
+ return rays, pos
58
+
59
+ class PointmapEncoder(nn.Module):
60
+ def __init__(self,
61
+ dim=256,
62
+ patch_size=8,
63
+ img_size=(128,128),
64
+ depth=3,
65
+ num_heads=4,
66
+ pos_embed='RoPE100',
67
+ ):
68
+ super().__init__()
69
+ self.img_size = img_size
70
+ self.patch_embed = PatchEmbed(img_size=self.img_size, patch_size=patch_size, in_chans=3, embed_dim=dim)
71
+ self.dim = dim
72
+ self.patch_size = patch_size
73
+
74
+ if pos_embed.startswith('RoPE'):
75
+ freq = float(pos_embed[len('RoPE'):])
76
+ self.rope = RoPE2D(freq=freq)
77
+ else:
78
+ self.rope = None
79
+ self.blocks = nn.ModuleList([Block(dim=dim, num_heads=num_heads,rope=self.rope) for _ in range(depth)])
80
+ self.masked_token = nn.Parameter(torch.randn(1,1,3))
81
+ self.initialize_weights()
82
+
83
+ def initialize_weights(self):
84
+ # patch embed
85
+ self.patch_embed._init_weights()
86
+
87
+ # linears and layer norms
88
+ self.apply(init_weights)
89
+
90
+ def forward(self, pointmaps,masks=None):
91
+ # replace masked points (not on object) with a learned token
92
+ pointmaps[~masks] = self.masked_token.to(pointmaps.dtype).to(pointmaps.device)
93
+ pointmaps = pointmaps.permute(0,3,1,2)
94
+ pointmaps, pos = self.patch_embed(pointmaps)
95
+
96
+ for blk in self.blocks:
97
+ pointmaps = blk(pointmaps, pos)
98
+ return pointmaps, pos
99
+
100
+ class RayQuery(nn.Module):
101
+ def __init__(self,
102
+ ray_enc=RayEncoder(),
103
+ pointmap_enc=PointmapEncoder(),
104
+ dec_pos_embed='RoPE100',
105
+ decoder_dim=256,
106
+ decoder_depth=3,
107
+ decoder_num_heads=4,
108
+ imshape=(128,128),
109
+ pts_head_type='dpt',
110
+ classifier_head_type='dpt_mask',
111
+ criterion=ConfLoss(L21),
112
+ return_all_blocks=True,
113
+ depth_mode=('exp',-float('inf'),float('inf')),
114
+ conf_mode=('exp',1,float('inf')),
115
+ classifier_mode=('raw',0,1),
116
+ dino_layers=[23],
117
+ ):
118
+ super().__init__()
119
+ self.ray_enc = ray_enc
120
+ self.pointmap_enc = pointmap_enc
121
+ self.dec_depth = decoder_depth
122
+ self.dec_embed_dim = decoder_dim
123
+ self.enc_embed_dim = ray_enc.dim
124
+ self.patch_size = pointmap_enc.patch_size
125
+ self.depth_mode = depth_mode
126
+ self.conf_mode = conf_mode
127
+ self.classifier_mode = classifier_mode
128
+ self.skip_dino = len(dino_layers) == 0
129
+ self.pts_head_type = pts_head_type
130
+ self.classifier_head_type = classifier_head_type
131
+
132
+ if dec_pos_embed.startswith('RoPE'):
133
+ self.dec_pos_embed = RoPE2D(freq=100.0)
134
+ else:
135
+ raise NotImplementedError(f'{dec_pos_embed} not implemented')
136
+ self.decoder_blocks = nn.ModuleList([DecoderBlock(dim=decoder_dim, num_heads=decoder_num_heads,
137
+ rope=self.dec_pos_embed) for _ in range(decoder_depth)])
138
+ self.pts_head = head_factory(pts_head_type, 'pts3d', self, has_conf=True)
139
+
140
+ self.classifier_head = head_factory(classifier_head_type, 'pts3d', self, has_conf=True)
141
+ self.imshape = imshape
142
+ self.criterion = criterion
143
+ self.return_all_blocks = return_all_blocks
144
+
145
+ # dino projection
146
+ self.dino_layers = dino_layers
147
+ self.dino_proj = nn.Linear(1024 * len(dino_layers), decoder_dim)
148
+ self.dino_pos_getter = PositionGetter()
149
+
150
+ self.initialize_weights()
151
+
152
+ def initialize_weights(self):
153
+ self.apply(init_weights)
154
+
155
+ def forward_encoders(self, rays, pointmaps,masks=None):
156
+ # encode rays
157
+ rays, rays_pos = self.ray_enc(rays)
158
+
159
+ # encode pointmaps
160
+ B, H, W, C = pointmaps.shape
161
+ pointmaps = pointmaps.reshape(B,H,W,C) # each pointmap is encoded separately
162
+ pointmaps, pointmaps_pos = self.pointmap_enc(pointmaps,masks=masks)
163
+ new_shape = pointmaps.shape
164
+ pointmaps = pointmaps.reshape(new_shape[0],*new_shape[1:])
165
+ pointmaps_pos = pointmaps_pos[:B]
166
+
167
+ return rays, rays_pos, pointmaps, pointmaps_pos
168
+
169
+ def forward_decoder(self, rays, rays_pos, pointmaps, pointmaps_pos):
170
+ if self.return_all_blocks:
171
+ all_blocks = []
172
+ for blk in self.decoder_blocks:
173
+ rays, pointmaps = blk(rays, pointmaps, rays_pos, pointmaps_pos)
174
+ all_blocks.append(rays)
175
+ return all_blocks
176
+ else:
177
+ for blk in self.decoder_blocks:
178
+ rays, pointmaps = blk(rays, pointmaps, rays_pos, pointmaps_pos)
179
+ return rays
180
+
181
+ def get_dino_pos(self,dino_features):
182
+ # dino runs on 14x14 patches
183
+ # note: assuming we cropped or resized down!
184
+ dino_H = self.imshape[0]//14
185
+ dino_W = self.imshape[1]//14
186
+ dino_pos = self.dino_pos_getter(dino_features.shape[0],dino_H,dino_W,dino_features.device)
187
+ return dino_pos
188
+
189
+ def forward(self,batch,mode='loss'):
190
+ # prep for encoders
191
+ rays = compute_rays(batch) # we are querying the first camera
192
+ pointmaps_context = batch['input_cams']['pointmaps'] # we are using the other cameras as context
193
+ input_masks = batch['input_cams']['valid_masks']
194
+
195
+ # run the encoders
196
+ rays, rays_pos, pointmaps, pointmaps_pos = self.forward_encoders(rays, pointmaps_context,masks=input_masks)
197
+ ## adding dino features
198
+ if not self.skip_dino:
199
+ dino_features = batch['input_cams']['dino_features']
200
+ dino_features = self.dino_proj(dino_features)
201
+ if len(dino_features.shape) == 4:
202
+ dino_features = dino_features.squeeze(1)
203
+ dino_pos = self.get_dino_pos(dino_features)
204
+ pointmaps = torch.cat([pointmaps,dino_features],dim=1)
205
+ pointmaps_pos = torch.cat([pointmaps_pos,dino_pos],dim=1)
206
+ else:
207
+ dino_features = None
208
+ dino_pos = None
209
+ # decoder
210
+ rays = self.forward_decoder(rays, rays_pos, pointmaps, pointmaps_pos)
211
+ pts_pred_dict = self.pts_head(rays, self.imshape)
212
+ classifier_pred_dict = self.classifier_head(rays, self.imshape)
213
+
214
+ pred_dict = {**pts_pred_dict,**classifier_pred_dict}
215
+ gt_dict = batch['new_cams']
216
+ loss_dict = self.criterion(pred_dict, gt_dict)
217
+
218
+ del rays, rays_pos, pointmaps, pointmaps_pos, dino_features, dino_pos, pointmaps_context, input_masks, pts_pred_dict, classifier_pred_dict
219
+
220
+ if mode == 'loss':
221
+ # delete all the variables that are not needed
222
+ del pred_dict, gt_dict
223
+ return loss_dict
224
+ elif mode == 'viz':
225
+ return pred_dict, gt_dict, loss_dict
226
+ else:
227
+ raise ValueError(f"Invalid mode: {mode}")
readme.md ADDED
@@ -0,0 +1,112 @@
1
+ <div align="center">
2
+
3
+ # RaySt3R: Predicting Novel Depth Maps for Zero-Shot Object Completion
4
+
5
+ <a href="https://arxiv.org/abs/2506.05285"><img src='https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white' alt='arXiv'></a>
6
+ <a href='https://rayst3r.github.io'><img src='https://img.shields.io/badge/Project_Page-Website-green?logo=googlechrome&logoColor=white' alt='Project Page'></a>
7
+
8
+ </div>
9
+
10
+ <div align="center">
11
+ <img src="assets/overview.png" width="80%" alt="Method overview">
12
+ </div>
13
+
14
+ ## 📚 Citation
15
+ ```bibtex
16
+ @misc{rayst3r,
17
+ title={RaySt3R: Predicting Novel Depth Maps for Zero-Shot Object Completion},
18
+ author={Bardienus P. Duisterhof and Jan Oberst and Bowen Wen and Stan Birchfield and Deva Ramanan and Jeffrey Ichnowski},
19
+ year={2025},
20
+ eprint={2506.05285},
21
+ archivePrefix={arXiv},
22
+ primaryClass={cs.CV},
23
+ url={https://arxiv.org/abs/2506.05285},
24
+ }
25
+ ```
26
+ ## ✅ TO-DOs
27
+
28
+ - [x] Inference code
29
+ - [x] Local gradio demo
30
+ - [ ] Huggingface demo
31
+ - [ ] Docker
32
+ - [ ] Training code
33
+ - [ ] Eval code
34
+ - [ ] ViT-S, No-DINO and Pointmap models
35
+ - [ ] Dataset release
36
+
37
+ # ⚙️ Installation
38
+
39
+ ```bash
40
+ mamba create -n rayst3r python=3.11 cmake=3.14.0
41
+ mamba activate rayst3r
42
+ mamba install pytorch torchvision pytorch-cuda=12.4 -c pytorch -c nvidia # change to your version of cuda
43
+ pip install -r requirements.txt
44
+
45
+ # compile the cuda kernels for RoPE
46
+ cd extensions/curope/
47
+ python setup.py build_ext --inplace
48
+ cd ../../
49
+ ```
50
+
51
+ # 🚀 Usage
52
+
53
+ The expected input for RaySt3R is a folder with the following structure:
54
+
55
+ <pre><code>
56
+ 📁 data_dir/
57
+ ├── cam2world.pt # Camera-to-world transformation (PyTorch tensor), 4x4 - eye(4) if not provided
58
+ ├── depth.png # Depth image, uint16 with max 10 meters
59
+ ├── intrinsics.pt # Camera intrinsics (PyTorch tensor), 3x3
60
+ ├── mask.png # Binary mask image
61
+ └── rgb.png # RGB image
62
+ </code></pre>
63
+
64
+ Note that the depth image must be saved as uint16, normalized to a 0-10 meter range. We provide an example directory in `example_scene`.
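
For illustration only (not part of the repo), a sketch of how such a `data_dir` could be written from Python; the resolution and intrinsics below are placeholder values, and `rgb.png` / `mask.png` are saved as ordinary 8-bit images:

```python
import numpy as np
import torch
from PIL import Image

depth_m = np.random.rand(480, 640).astype(np.float32) * 2.0           # depth in meters
depth_u16 = (np.clip(depth_m / 10.0, 0, 1) * 65535).astype(np.uint16)  # 0-10 m -> uint16
Image.fromarray(depth_u16).save("data_dir/depth.png")

torch.save(torch.eye(4), "data_dir/cam2world.pt")                      # identity pose
K = torch.tensor([[600., 0., 320.], [0., 600., 240.], [0., 0., 1.]])   # placeholder intrinsics
torch.save(K, "data_dir/intrinsics.pt")
```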
65
+ Run RaySt3R with:
66
+
67
+
68
+ ```bash
69
+ python3 eval_wrapper/eval.py example_scene/
70
+ ```
71
+ This writes a colored point cloud back into the input directory.
72
+
73
+ Optional flags:
74
+ ```bash
75
+ --visualize # Spins up a rerun client to visualize predictions and camera poses
76
+ --run_octmae # Novel views sampled with the OctMAE parameters (see paper)
77
+ --set_conf N # Sets confidence threshold to N
78
+ --n_pred_views # Number of predicted views along each axis of the grid (e.g. 5 gives 22 views in total)
79
+ --filter_all_masks # Use all masks, point gets rejected if in background for a single mask
80
+ --tsdf # Fits TSDF to depth maps
81
+ ```
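
For example, to combine several of the flags above (the numeric values are arbitrary, illustrative choices):

```bash
python3 eval_wrapper/eval.py example_scene/ --set_conf 5.0 --n_pred_views 3 --tsdf
```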
82
+
83
+ # 🧪 Gradio app
84
+
85
+ We also provide a gradio app, which uses <a href="https://wangrc.site/MoGePage/">MoGe</a> and <a href="https://github.com/danielgatis/rembg">Rembg</a> to generate 3D from a single image.
86
+
87
+ Launch it with:
88
+ ```bash
89
+ python app.py
90
+ ```
91
+
92
+ # 🎛️ Parameter Guide
93
+
94
+ Certain applications may benefit from different hyperparameters; here we provide guidance on how to select them.
95
+
96
+ #### 🔁 View Sampling
97
+
98
+ We sample novel views evenly on a cylindrical equal-area projection of the sphere.
99
+ Customize sampling in <a href="eval_wrapper/sample_poses.py">sample_poses.py</a>. Use --n_pred_views to reduce the total number of views, making inference faster and reducing overlap and artifacts.
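
As a rough sketch of the cylindrical equal-area idea (uniform in height and azimuth); the actual grid and pose construction live in `sample_poses.py` and may differ in the details:

```python
import numpy as np

def sample_viewpoints(n, radius=1.0):
    """n x n camera centers on a sphere: equal-area in z, uniform in azimuth."""
    z = np.linspace(-1 + 1.0 / n, 1 - 1.0 / n, n)        # equal-area height bands
    theta = np.linspace(0, 2 * np.pi, n, endpoint=False)  # azimuth samples
    zz, tt = np.meshgrid(z, theta, indexing="ij")
    r = np.sqrt(1.0 - zz ** 2)
    pts = np.stack([r * np.cos(tt), r * np.sin(tt), zz], axis=-1) * radius
    return pts.reshape(-1, 3)
```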
100
+
101
+ #### 🟢 Confidence Threshold
102
+
103
+ You can set the confidence threshold with the --set_conf flag. As shown in the paper, a higher threshold generally improves accuracy and reduces edge bleeding, but at the cost of completeness.
104
+
105
+ #### 🧼 RaySt3R Masks
106
+
107
+ On top of what was presented in the paper, we also provide the option to consider all predicted masks for each point. I.e., if any of the predicted masks classifies a point as background, that point gets removed.
108
+ In our limited testing this led to cleaner predictions, but it occasionally carves out crucial parts of the geometry.
109
+
110
+ # 🏋️ Training
111
+
112
+ The RaySt3R training command is provided in <a href="xps/train_rayst3r.py">train_rayst3r.py</a>; documentation will follow later.
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ matplotlib
2
+ numpy
3
+ open3d
4
+ Pillow
5
+ pyrender
6
+ rerun
7
+ setuptools
8
+ tqdm
9
+ trimesh
10
+ huggingface-hub
11
+ wandb
12
+ einops
13
+
14
+ # for app.py
15
+ onnxruntime
16
+ gradio
17
+ rembg
18
+ git+https://github.com/microsoft/MoGe.git
utils/augmentations.py ADDED
@@ -0,0 +1,184 @@
1
+ import random
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from abc import ABC, abstractmethod
5
+ from torchvision.transforms import GaussianBlur
6
+ from utils.batch_prep import compute_pointmaps
7
+ import imgaug as ia
8
+ import imgaug.augmenters as iaa
9
+ import numpy as np
10
+
11
+ class ChangeBright(torch.nn.Module):
12
+ def __init__(self,prob=0.5,mag=[0.5,2.0]):
13
+ super().__init__()
14
+ self.mag = mag
15
+ self.prob = prob
16
+
17
+ def forward(self,rgb):
18
+ #if np.random.uniform()>=self.prob:
19
+ #return rgb
20
+ n = rgb.shape[0]
21
+ apply_aug = np.random.uniform(0,1,size=n) < self.prob
22
+ aug = iaa.MultiplyBrightness(np.random.uniform(self.mag[0],self.mag[1])) # NOTE: iaa's deterministic mode is buggy, so we sample the magnitude ourselves
23
+ rgb[apply_aug] = aug(images=rgb[apply_aug])
24
+ return rgb
25
+
26
+ class ChangeContrast(torch.nn.Module):
27
+ def __init__(self,prob=0.5,mag=[0.5,2.0]):
+ super().__init__()
+ self.mag = mag
29
+ self.prob = prob
30
+
31
+ def __call__(self,rgb):
32
+ n = rgb.shape[0]
33
+ apply_aug = np.random.uniform(0,1,size=n) < self.prob
34
+
35
+ aug = iaa.GammaContrast(np.random.uniform(self.mag[0],self.mag[1]))
36
+ rgb[apply_aug] = aug(images=rgb[apply_aug])
37
+ return rgb
38
+
39
+ class SaltAndPepper:
40
+ def __init__(self, prob=0.3, ratio=0.1, per_channel=True):
41
+ self.prob = prob
42
+ self.ratio = ratio
43
+ self.per_channel = per_channel
44
+
45
+ def __call__(self, rgb):
46
+ n = rgb.shape[0]
47
+ apply_aug = np.random.uniform(0,1,size=n) < self.prob
48
+ aug = iaa.SaltAndPepper(self.ratio, per_channel=self.per_channel).to_deterministic()
49
+ rgb[apply_aug] = aug(images=rgb[apply_aug])
50
+ return rgb
51
+
52
+ class RGBGaussianNoise:
53
+ def __init__(self, max_noise=10, prob=0.5):
54
+ self.max_noise = max_noise
55
+ self.prob = prob
56
+
57
+ def __call__(self, rgb):
58
+ n = rgb.shape[0]
59
+ apply_aug = np.random.uniform(0,1,size=n) < self.prob
60
+
61
+ shape = rgb.shape
62
+ noise = np.random.normal(0, self.max_noise, size=shape).clip(-self.max_noise, self.max_noise)
63
+ rgb[apply_aug] = (rgb[apply_aug].astype(float) + noise[apply_aug]).clip(0,255).astype(np.uint8)
64
+ return rgb
65
+
66
+ # from https://github.com/mihdalal/manipgen/blob/master/manipgen/utils/obs_utils.py
67
+ class DepthWarping(torch.nn.Module):
68
+ def __init__(self, std=0.5, prob=0.8):
69
+ super().__init__()
70
+ self.std = std
71
+ self.prob = prob
72
+
73
+ def forward(self, depths, device=None):
74
+ if device is None:
75
+ device = depths.device
76
+
77
+ n, _, h, w = depths.shape
78
+
79
+ # Generate Gaussian shifts
80
+ gaussian_shifts = torch.normal(mean=0, std=self.std, size=(n, h, w, 2), device=device).float()
81
+ apply_shifts = torch.rand(n, device=device) < self.prob
82
+ gaussian_shifts[~apply_shifts] = 0.0
83
+
84
+ # Create grid for the original coordinates
85
+ xx = torch.linspace(0, w - 1, w, device=device)
86
+ yy = torch.linspace(0, h - 1, h, device=device)
87
+ xx = xx.unsqueeze(0).repeat(h, 1)
88
+ yy = yy.unsqueeze(1).repeat(1, w)
89
+ grid = torch.stack((xx, yy), 2).unsqueeze(0) # Add batch dimension
90
+
91
+ # Apply Gaussian shifts to the grid
92
+ grid = grid + gaussian_shifts
93
+
94
+ # Normalize grid values to the range [-1, 1] for grid_sample
95
+ grid[..., 0] = (grid[..., 0] / (w - 1)) * 2 - 1
96
+ grid[..., 1] = (grid[..., 1] / (h - 1)) * 2 - 1
97
+
98
+ # Perform the remapping using grid_sample
99
+ depth_interp = F.grid_sample(depths, grid, mode='bilinear', padding_mode='border', align_corners=True)
100
+
101
+ # Remove the batch and channel dimensions
102
+ depth_interp = depth_interp.squeeze(0).squeeze(0)
103
+
104
+ return depth_interp
105
+
106
+ class DepthHoles(torch.nn.Module):
107
+ def __init__(self, prob=0.5, kernel_size_lower=3, kernel_size_upper=27, sigma_lower=1.0,
108
+ sigma_upper=7.0, thresh_lower=0.6, thresh_upper=0.9):
109
+ super().__init__()
110
+ self.prob = prob
111
+ self.kernel_size_lower = kernel_size_lower
112
+ self.kernel_size_upper = kernel_size_upper
113
+ self.sigma_lower = sigma_lower
114
+ self.sigma_upper = sigma_upper
115
+ self.thresh_lower = thresh_lower
116
+ self.thresh_upper = thresh_upper
117
+
118
+ def forward(self, depths, device=None):
119
+ if device is None:
120
+ device = depths.device
121
+
122
+ n, _, h, w = depths.shape
123
+ # generate random noise
124
+ noise = torch.rand(n, 1, h, w, device=device)
125
+
126
+ # apply gaussian blur
127
+ k = random.choice(list(range(self.kernel_size_lower, self.kernel_size_upper+1, 2)))
128
+ noise = GaussianBlur(kernel_size=k, sigma=(self.sigma_lower, self.sigma_upper))(noise)
129
+
130
+ # normalize noise
131
+ noise = (noise - noise.min()) / (noise.max() - noise.min())
132
+
133
+ # apply thresholding
134
+ thresh = torch.rand(n, 1, 1, 1, device=device) * (self.thresh_upper - self.thresh_lower) + self.thresh_lower
135
+ mask = (noise > thresh)
136
+ prob = self.prob
137
+ keep_mask = torch.rand(n, device=device) < prob
138
+ mask[~keep_mask, :] = 0
139
+
140
+ return mask
141
+
142
+ class DepthNoise(torch.nn.Module):
143
+ def __init__(self, std=0.005,prob=1.0):
144
+ super().__init__()
145
+ self.std = std
146
+ self.prob = prob
147
+
148
+ def forward(self, depths, device=None):
149
+ if device is None:
150
+ device = depths.device
151
+
152
+ n, _, h, w = depths.shape
153
+ apply_noise = torch.rand(n, device=device) < self.prob
154
+ noise = torch.randn(n, 1, h, w, device=device) * self.std
155
+ noise[~apply_noise] = 0.0
156
+ return depths + noise
157
+
158
+ class Augmentor(torch.nn.Module):
159
+ def __init__(self, depth_holes=DepthHoles(), depth_warping=DepthWarping(),depth_noise=DepthNoise(),
160
+ rgb_operators=[ChangeBright(),SaltAndPepper(),ChangeContrast(),RGBGaussianNoise()]):
161
+ super().__init__()
162
+ self.depth_holes = depth_holes
163
+ self.depth_warping = depth_warping
164
+ self.depth_noise = depth_noise
165
+ self.rgb_operators = rgb_operators
166
+
167
+ def forward(self, batch):
168
+ input_depths = batch['input_cams']['depths']
169
+ if self.depth_holes.prob > 0:
170
+ masks = self.depth_holes(input_depths)
171
+ batch['input_cams']['valid_masks'][masks] = False
172
+ #if self.depth_warping.prob > 0:
173
+ #input_depths = self.depth_warping(input_depths)
174
+ if self.depth_noise.prob > 0:
175
+ input_depths = self.depth_noise(input_depths)
176
+
177
+ input_rgbs = batch['input_cams']['imgs'].squeeze(1).cpu().numpy() # this is a bit inefficient, but it's ok..
178
+ for op in self.rgb_operators:
179
+ input_rgbs = op(input_rgbs)
180
+ batch['input_cams']['imgs'] = torch.from_numpy(input_rgbs).cuda().unsqueeze(1)
181
+
182
+ batch['input_cams']['depths'] = input_depths
183
+ batch['input_cams']['pointmaps'] = compute_pointmaps(batch['input_cams']['depths'],batch['input_cams']['Ks'],batch['input_cams']['c2ws']) # now we're doing this twice, but alas
184
+ return batch
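
A quick sketch of the individual depth augmentations above on synthetic data (the full Augmentor expects a batch dict and CUDA tensors):

```python
import torch
from utils.augmentations import DepthNoise, DepthHoles

depths = torch.rand(2, 1, 64, 64) + 0.5              # (n, 1, h, w), meters
noisy = DepthNoise(std=0.005, prob=1.0)(depths)      # additive Gaussian noise
holes = DepthHoles(prob=1.0)(depths)                 # boolean mask of regions to drop
print(noisy.shape, holes.shape, holes.dtype)
```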
utils/batch_prep.py ADDED
@@ -0,0 +1,141 @@
1
+ import torch
2
+ import torchvision.transforms as tvf
3
+
4
+ dino_patch_size = 14
5
+
6
+ def batch_to_device(batch,device='cuda'):
7
+ for key in batch:
8
+ if isinstance(batch[key],torch.Tensor):
9
+ batch[key] = batch[key].to(device)
10
+ elif isinstance(batch[key],dict):
11
+ batch[key] = batch_to_device(batch[key],device)
12
+ return batch
13
+
14
+
15
+ def compute_pointmap(depth: torch.Tensor, intrinsics: torch.Tensor, cam2world: torch.Tensor = None) -> torch.Tensor:
16
+ fx, fy = intrinsics[0, 0], intrinsics[1, 1]
17
+ cx, cy = intrinsics[0, 2], intrinsics[1, 2]
18
+ h, w = depth.shape
19
+
20
+ i, j = torch.meshgrid(torch.arange(w), torch.arange(h), indexing='xy')
21
+ i = i.to(depth.device)
22
+ j = j.to(depth.device)
23
+
24
+ x_cam = (i - cx) * depth / fx
25
+ y_cam = (j - cy) * depth / fy
26
+
27
+ points_cam = torch.stack([x_cam, y_cam, depth], axis=-1)
28
+
29
+ if cam2world is not None:
30
+ points_cam = torch.matmul(cam2world[:3, :3], points_cam.reshape(-1, 3).T).T + cam2world[:3, 3]
31
+ points_cam = points_cam.reshape(h, w, 3)
32
+
33
+ return points_cam
34
+
35
+ def compute_pointmaps(depths: torch.Tensor, intrinsics: torch.Tensor, cam2worlds: torch.Tensor) -> torch.Tensor:
36
+ pointmaps = []
37
+ depth_shape = depths.shape
38
+ pointmaps_shape = depths.shape + (3,)
39
+ for depth, K, c2w in zip(depths, intrinsics, cam2worlds):
40
+ n_views = depth.shape[0]
41
+ for i in range(n_views):
42
+ pointmaps.append(compute_pointmap(depth[i], K[i],c2w[i]))
43
+ return torch.stack(pointmaps).reshape(pointmaps_shape)
44
+
45
+ def depth_to_metric(depth):
46
+ # depth: shape H x W
47
+ # we want to convert the depth to a metric depth
48
+ depth_max = 10.0
49
+ depth_scaled = depth_max * (depth / 65535.0)
50
+
51
+ return depth_scaled
52
+
53
+ def make_rgb_transform() -> tvf.Compose:
54
+ return tvf.Compose([
55
+ #tvf.ToTensor(),
56
+ #lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255
57
+ tvf.Normalize(
58
+ mean=(123.675, 116.28, 103.53),
59
+ std=(58.395, 57.12, 57.375),
60
+ ),
61
+ ])
62
+
63
+ rgb_transform = make_rgb_transform()
64
+
65
+ def compute_dino_and_store_features(dino_model : torch.nn.Module, rgb: torch.Tensor, mask: torch.Tensor,dino_layers: list[int] = None) -> torch.Tensor:
66
+ """Computes the DINO features given an RGB image."""
67
+ rgb = rgb.squeeze(1)
68
+ mask = mask.squeeze(1)
69
+ rgb = rgb.permute(0,3,1,2)
70
+ mask = mask.unsqueeze(1).repeat(1,3,1,1)
71
+ rgb = rgb * mask
72
+
73
+ rgb = rgb.float()
74
+ H, W = rgb.shape[-2:]
75
+ goal_H, goal_W = H//dino_patch_size*dino_patch_size, W//dino_patch_size*dino_patch_size
76
+ resize_transform = tvf.CenterCrop([goal_H, goal_W])
77
+ with torch.no_grad():
78
+ rgb = resize_transform(rgb)
79
+ rgb = rgb_transform(rgb)
80
+ all_feat = dino_model.get_intermediate_layers(rgb, dino_layers)
81
+ dino_feat = torch.cat(all_feat, dim=-1)
82
+ return dino_feat
83
+
84
+
85
+ def prepare_fast_batch(batch,dino_model = None,dino_layers = None):
86
+ # depth to metric
87
+ batch['new_cams']['depths'] = depth_to_metric(batch['new_cams']['depths'])
88
+ batch['input_cams']['depths'] = depth_to_metric(batch['input_cams']['depths'])
89
+
90
+ # compute pointmaps
91
+ batch['new_cams']['pointmaps'] = compute_pointmaps(batch['new_cams']['depths'],batch['new_cams']['Ks'],batch['new_cams']['c2ws'])
92
+ batch['input_cams']['pointmaps'] = compute_pointmaps(batch['input_cams']['depths'],batch['input_cams']['Ks'],batch['input_cams']['c2ws'])
93
+
94
+ # compute dino features
95
+ if dino_model is not None and len(dino_layers) > 0:
96
+ batch['input_cams']['dino_features'] = compute_dino_and_store_features(dino_model,batch['input_cams']['imgs'],batch['input_cams']['valid_masks'],dino_layers)
97
+
98
+ return batch
99
+
100
+
101
+ def normalize_batch(batch,normalize_mode):
102
+ scale_factors = []
103
+ if normalize_mode == 'None':
104
+ pass
105
+ elif normalize_mode == 'median':
106
+ B = batch['input_cams']['valid_masks'].shape[0]
107
+ for b in range(B):
108
+ input_mask = batch['input_cams']['valid_masks'][b]
109
+ depth_median = batch['input_cams']['depths'][b][input_mask].median()
110
+ scale_factor = 1.0 / depth_median
111
+ scale_factors.append(scale_factor)
112
+ batch['input_cams']['depths'][b] = scale_factor * batch['input_cams']['depths'][b]
113
+ batch['input_cams']['pointmaps'][b] = scale_factor * batch['input_cams']['pointmaps'][b]
114
+ batch['input_cams']['c2ws'][b][0,:3,-1] = scale_factor * batch['input_cams']['c2ws'][b][0,:3,-1]
115
+
116
+ batch['new_cams']['depths'][b] = scale_factor * batch['new_cams']['depths'][b]
117
+ batch['new_cams']['pointmaps'][b] = scale_factor * batch['new_cams']['pointmaps'][b]
118
+ batch['new_cams']['c2ws'][b][:,:3,-1] = scale_factor * batch['new_cams']['c2ws'][b][:,:3,-1]
119
+
120
+ return batch, scale_factors
121
+
122
+ def denormalize_batch(batch,pred,gt,scale_factors):
123
+ B = len(scale_factors)
124
+ n_new_cams = batch['new_cams']['c2ws'].shape[1]
125
+ for b in range(B):
126
+ new_scale_factor = 1.0 / scale_factors[b]
127
+ batch['input_cams']['depths'][b] = new_scale_factor * batch['input_cams']['depths'][b]
128
+ batch['input_cams']['pointmaps'][b] = new_scale_factor * batch['input_cams']['pointmaps'][b]
129
+ batch['input_cams']['c2ws'][b][:,:3,-1] = new_scale_factor * batch['input_cams']['c2ws'][b][:,:3,-1]
130
+ batch['new_cams']['depths'][b] = new_scale_factor * batch['new_cams']['depths'][b]
131
+ batch['new_cams']['pointmaps'][b] = new_scale_factor * batch['new_cams']['pointmaps'][b]
132
+ batch['new_cams']['c2ws'][b][:,:3,-1] = new_scale_factor * batch['new_cams']['c2ws'][b][:,:3,-1]
133
+
134
+ pred['depths'][b] = new_scale_factor * pred['depths'][b]
135
+
136
+ gt['c2ws'][b][:,:3,-1] = new_scale_factor * gt['c2ws'][b][:,:3,-1]
137
+ gt['depths'][b] = new_scale_factor * gt['depths'][b]
138
+
139
+ gt['pointmaps'][b] = compute_pointmaps(gt['depths'][b].unsqueeze(1),gt['Ks'][b].unsqueeze(1),gt['c2ws'][b].unsqueeze(1)).squeeze(1)
140
+ pred['pointmaps'][b] = compute_pointmaps(pred['depths'][b].unsqueeze(1),gt['Ks'][b].unsqueeze(1),gt['c2ws'][b].unsqueeze(1)).squeeze(1)
141
+ return batch, pred, gt
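
A small check of the unprojection in `compute_pointmap` above, with placeholder intrinsics:

```python
import torch
from utils.batch_prep import compute_pointmap

depth = torch.full((4, 4), 2.0)                                   # 2 m everywhere
K = torch.tensor([[50., 0., 2.], [0., 50., 2.], [0., 0., 1.]])    # toy intrinsics
pts = compute_pointmap(depth, K)                                  # (4, 4, 3), camera frame
assert torch.allclose(pts[..., 2], depth)                         # z channel is the depth
```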
utils/collate.py ADDED
@@ -0,0 +1,7 @@
1
+ import torch
2
+
3
+ def collate(batch):
4
+ if isinstance(batch[0],dict):
5
+ return {k: collate([d[k] for d in batch]) for k in batch[0].keys()}
6
+ else:
7
+ return torch.stack([torch.stack(t) for t in batch])
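
A usage sketch for the collate function above, assuming each leaf is a list of per-view tensors (which is what the stacking in the else branch expects):

```python
import torch
from utils.collate import collate

sample = {'depths': [torch.rand(8, 8)], 'Ks': [torch.eye(3)]}
batch = collate([sample, sample])
print(batch['depths'].shape)   # torch.Size([2, 1, 8, 8])
print(batch['Ks'].shape)       # torch.Size([2, 1, 3, 3])
```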
utils/eval.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch
2
+
3
+ def eval_pred(pred_dict, gt_dict,accuracy_tresh=[0.001,0.01,0.02,0.05,0.1,0.5]):
4
+ pointmaps_pred = pred_dict['pointmaps']
5
+ pointmaps_gt = gt_dict['pointmaps']
6
+ mask = gt_dict['valid_masks'].unsqueeze(-1).repeat(1,1,1,3)
7
+
8
+ points_pred = pointmaps_pred[mask].reshape(-1,3)
9
+ points_gt = pointmaps_gt[mask].reshape(-1,3)
10
+ dists = torch.norm(points_pred - points_gt, dim=1)
11
+ results = {'dist':dists.mean().detach().item()}
12
+ if 'classifier' in pred_dict:
13
+ classifier_pred = (torch.sigmoid(pred_dict['classifier']) > 0.5).bool()
14
+ classifier_gt = gt_dict['valid_masks']
15
+ results['classifier_acc'] = (classifier_pred == classifier_gt).float().mean().detach().item()
16
+
17
+ for tresh in accuracy_tresh:
18
+ acc = (dists < tresh).float().mean()
19
+ results[f'acc_{tresh}'] = acc.detach().item()
20
+ return results
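
A minimal sketch of `eval_pred` on synthetic predictions:

```python
import torch
from utils.eval import eval_pred

gt = {'pointmaps': torch.rand(1, 32, 32, 3),
      'valid_masks': torch.ones(1, 32, 32, dtype=torch.bool)}
pred = {'pointmaps': gt['pointmaps'] + 0.005}          # small constant offset
print(eval_pred(pred, gt))                             # mean distance + accuracy@thresholds
```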
utils/fusion.py ADDED
@@ -0,0 +1,476 @@
1
+ # Copyright (c) 2018 Andy Zeng
2
+
3
+ import numpy as np
4
+ import torch
5
+ from numba import njit, prange
6
+ from skimage import measure
7
+
8
+ try:
9
+ import pycuda.driver as cuda
10
+ import pycuda.autoinit
11
+ from pycuda.compiler import SourceModule
12
+ FUSION_GPU_MODE = 1
13
+ except Exception as err:
14
+ print('Warning: {}'.format(err))
15
+ print('Failed to import PyCUDA. Running fusion in CPU mode.')
16
+ FUSION_GPU_MODE = 0
17
+
18
+
19
+ class TSDFVolume:
20
+ """Volumetric TSDF Fusion of RGB-D Images.
21
+ """
22
+ def __init__(self, vol_bnds, voxel_size, use_gpu=True):
23
+ """Constructor.
24
+
25
+ Args:
26
+ vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
27
+ xyz bounds (min/max) in meters.
28
+ voxel_size (float): The volume discretization in meters.
29
+ """
30
+ vol_bnds = np.asarray(vol_bnds)
31
+ assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
32
+
33
+ # Define voxel volume parameters
34
+ self._vol_bnds = vol_bnds
35
+ self._voxel_size = float(voxel_size)
36
+ self._trunc_margin = 5 * self._voxel_size # truncation on SDF
37
+ self._color_const = 256 * 256
38
+
39
+ # Adjust volume bounds and ensure C-order contiguous
40
+ self._vol_dim = np.ceil((self._vol_bnds[:,1]-self._vol_bnds[:,0])/self._voxel_size).copy(order='C').astype(int)
41
+ self._vol_bnds[:,1] = self._vol_bnds[:,0]+self._vol_dim*self._voxel_size
42
+ self._vol_origin = self._vol_bnds[:,0].copy(order='C').astype(np.float32)
43
+
44
+ print("Voxel volume size: {} x {} x {} - # points: {:,}".format(
45
+ self._vol_dim[0], self._vol_dim[1], self._vol_dim[2],
46
+ self._vol_dim[0]*self._vol_dim[1]*self._vol_dim[2])
47
+ )
48
+
49
+ # Initialize pointers to voxel volume in CPU memory
50
+ self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32)
51
+ # for computing the cumulative moving average of observations per voxel
52
+ self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
53
+ self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
54
+
55
+ #self.gpu_mode = False # CPU for debugging!!
56
+ self.gpu_mode = use_gpu and FUSION_GPU_MODE
57
+
58
+ # Copy voxel volumes to GPU
59
+ if self.gpu_mode:
60
+ self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
61
+ cuda.memcpy_htod(self._tsdf_vol_gpu,self._tsdf_vol_cpu)
62
+ self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
63
+ cuda.memcpy_htod(self._weight_vol_gpu,self._weight_vol_cpu)
64
+ self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
65
+ cuda.memcpy_htod(self._color_vol_gpu,self._color_vol_cpu)
66
+
67
+ # Cuda kernel function (C++)
68
+ self._cuda_src_mod = SourceModule("""
69
+ __global__ void integrate(float * tsdf_vol,
70
+ float * weight_vol,
71
+ float * color_vol,
72
+ float * vol_dim,
73
+ float * vol_origin,
74
+ float * cam_intr,
75
+ float * cam_pose,
76
+ float * other_params,
77
+ float * color_im,
78
+ float * depth_im) {
79
+ // Get voxel index
80
+ int gpu_loop_idx = (int) other_params[0];
81
+ int max_threads_per_block = blockDim.x;
82
+ int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
83
+ int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
84
+ int vol_dim_x = (int) vol_dim[0];
85
+ int vol_dim_y = (int) vol_dim[1];
86
+ int vol_dim_z = (int) vol_dim[2];
87
+ if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
88
+ return;
89
+ // Get voxel grid coordinates (note: be careful when casting)
90
+ float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
91
+ float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
92
+ float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
93
+ // Voxel grid coordinates to world coordinates
94
+ float voxel_size = other_params[1];
95
+ float pt_x = vol_origin[0]+voxel_x*voxel_size;
96
+ float pt_y = vol_origin[1]+voxel_y*voxel_size;
97
+ float pt_z = vol_origin[2]+voxel_z*voxel_size;
98
+ // World coordinates to camera coordinates
99
+ float tmp_pt_x = pt_x-cam_pose[0*4+3];
100
+ float tmp_pt_y = pt_y-cam_pose[1*4+3];
101
+ float tmp_pt_z = pt_z-cam_pose[2*4+3];
102
+ float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
103
+ float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
104
+ float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
105
+ // Camera coordinates to image pixels
106
+ int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
107
+ int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
108
+ // Skip if outside view frustum
109
+ int im_h = (int) other_params[2];
110
+ int im_w = (int) other_params[3];
111
+ if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
112
+ return;
113
+ // Skip invalid depth
114
+ float depth_value = depth_im[pixel_y*im_w+pixel_x];
115
+ if (depth_value == 0)
116
+ return;
117
+ // Integrate TSDF
118
+ float trunc_margin = other_params[4];
119
+ float depth_diff = depth_value-cam_pt_z;
120
+ if (depth_diff < -trunc_margin)
121
+ return;
122
+ float dist = fmin(1.0f,depth_diff/trunc_margin);
123
+ float w_old = weight_vol[voxel_idx];
124
+ float obs_weight = other_params[5];
125
+ float w_new = w_old + obs_weight;
126
+ weight_vol[voxel_idx] = w_new;
127
+ tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
128
+ // Integrate color
129
+ float old_color = color_vol[voxel_idx];
130
+ float old_b = floorf(old_color/(256*256));
131
+ float old_g = floorf((old_color-old_b*256*256)/256);
132
+ float old_r = old_color-old_b*256*256-old_g*256;
133
+ float new_color = color_im[pixel_y*im_w+pixel_x];
134
+ float new_b = floorf(new_color/(256*256));
135
+ float new_g = floorf((new_color-new_b*256*256)/256);
136
+ float new_r = new_color-new_b*256*256-new_g*256;
137
+ new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
138
+ new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
139
+ new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
140
+ color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
141
+ }""")
142
+
143
+ self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
144
+
145
+ # Determine block/grid size on GPU
146
+ gpu_dev = cuda.Device(0)
147
+ self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
148
+ n_blocks = int(np.ceil(float(np.prod(self._vol_dim))/float(self._max_gpu_threads_per_block)))
149
+ grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X,int(np.floor(np.cbrt(n_blocks))))
150
+ grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y,int(np.floor(np.sqrt(n_blocks/grid_dim_x))))
151
+ grid_dim_z = min(gpu_dev.MAX_GRID_DIM_Z,int(np.ceil(float(n_blocks)/float(grid_dim_x*grid_dim_y))))
152
+ self._max_gpu_grid_dim = np.array([grid_dim_x,grid_dim_y,grid_dim_z]).astype(int)
153
+ self._n_gpu_loops = int(np.ceil(float(np.prod(self._vol_dim))/float(np.prod(self._max_gpu_grid_dim)*self._max_gpu_threads_per_block)))
154
+
155
+ else:
156
+ # Get voxel grid coordinates
157
+ xv, yv, zv = np.meshgrid(
158
+ range(self._vol_dim[0]),
159
+ range(self._vol_dim[1]),
160
+ range(self._vol_dim[2]),
161
+ indexing='ij'
162
+ )
163
+ self.vox_coords = np.concatenate([
164
+ xv.reshape(1,-1),
165
+ yv.reshape(1,-1),
166
+ zv.reshape(1,-1)
167
+ ], axis=0).astype(int).T
168
+
169
+ @staticmethod
170
+ @njit(parallel=True)
171
+ def vox2world(vol_origin, vox_coords, vox_size):
172
+ """Convert voxel grid coordinates to world coordinates.
173
+ """
174
+ vol_origin = vol_origin.astype(np.float32)
175
+ vox_coords = vox_coords.astype(np.float32)
176
+ cam_pts = np.empty_like(vox_coords, dtype=np.float32)
177
+ for i in prange(vox_coords.shape[0]):
178
+ for j in range(3):
179
+ cam_pts[i, j] = vol_origin[j] + (vox_size * vox_coords[i, j])
180
+ return cam_pts
181
+
182
+ @staticmethod
183
+ @njit(parallel=True)
184
+ def cam2pix(cam_pts, intr):
185
+ """Convert camera coordinates to pixel coordinates.
186
+ """
187
+ intr = intr.astype(np.float32)
188
+ fx, fy = intr[0, 0], intr[1, 1]
189
+ cx, cy = intr[0, 2], intr[1, 2]
190
+ pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
191
+ for i in prange(cam_pts.shape[0]):
192
+ pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
193
+ pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
194
+ return pix
195
+
196
+ @staticmethod
197
+ @njit(parallel=True)
198
+ def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
199
+ """Integrate the TSDF volume.
200
+ """
201
+ tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
202
+ w_new = np.empty_like(w_old, dtype=np.float32)
203
+ for i in prange(len(tsdf_vol)):
204
+ w_new[i] = w_old[i] + obs_weight
205
+ tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
206
+ return tsdf_vol_int, w_new
207
+
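The update in integrate_tsdf above is a per-voxel weighted running average of truncated signed distances. A minimal standalone sketch of the same rule, assuming a truncation margin of 0.05 m, unit observation weights, and the initial TSDF/weight values used by this class:

# hypothetical single voxel observed in two frames
trunc_margin = 0.05
tsdf, weight = 1.0, 0.0                          # TSDF volume starts at 1, weights at 0
for depth_diff in (0.02, -0.01):                 # measured depth minus voxel depth, in meters
    dist = min(1.0, depth_diff / trunc_margin)   # truncated signed distance
    weight_new = weight + 1.0                    # obs_weight = 1
    tsdf = (tsdf * weight + dist) / weight_new   # weighted running average
    weight = weight_new
print(tsdf, weight)                              # 0.1, 2.0 after the two observations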
208
+ def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.,mask=None):
209
+ """Integrate an RGB-D frame into the TSDF volume.
210
+
211
+ Args:
212
+ color_im (ndarray): An RGB image of shape (H, W, 3).
213
+ depth_im (ndarray): A depth image of shape (H, W).
214
+ cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
215
+ cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
216
+ obs_weight (float): The weight to assign for the current observation. A higher
217
+ value gives this observation more influence on the fused TSDF and color.
218
+ mask (ndarray, optional): Boolean mask of shape (H, W); when provided (CPU mode
+ only), only pixels where the mask is True are integrated.
+ """
219
+ im_h, im_w = depth_im.shape
220
+
221
+ # Fold RGB color image into a single channel image
222
+ color_im = color_im.astype(np.float32)
223
+ color_im = np.floor(color_im[...,2]*self._color_const + color_im[...,1]*256 + color_im[...,0])
224
+
225
+ if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel)
226
+ # no mask implemented yet
227
+ for gpu_loop_idx in range(self._n_gpu_loops):
228
+ self._cuda_integrate(self._tsdf_vol_gpu,
229
+ self._weight_vol_gpu,
230
+ self._color_vol_gpu,
231
+ cuda.InOut(self._vol_dim.astype(np.float32)),
232
+ cuda.InOut(self._vol_origin.astype(np.float32)),
233
+ cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
234
+ cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
235
+ cuda.InOut(np.asarray([
236
+ gpu_loop_idx,
237
+ self._voxel_size,
238
+ im_h,
239
+ im_w,
240
+ self._trunc_margin,
241
+ obs_weight
242
+ ], np.float32)),
243
+ cuda.InOut(color_im.reshape(-1).astype(np.float32)),
244
+ cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
245
+ block=(self._max_gpu_threads_per_block,1,1),
246
+ grid=(
247
+ int(self._max_gpu_grid_dim[0]),
248
+ int(self._max_gpu_grid_dim[1]),
249
+ int(self._max_gpu_grid_dim[2]),
250
+ )
251
+ )
252
+ else: # CPU mode: integrate voxel volume (vectorized implementation)
253
+ # Convert voxel grid coordinates to pixel coordinates
254
+ cam_pts = self.vox2world(self._vol_origin, self.vox_coords, self._voxel_size)
255
+ cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
256
+ pix_z = cam_pts[:, 2]
257
+ pix = self.cam2pix(cam_pts, cam_intr)
258
+ pix_x, pix_y = pix[:, 0], pix[:, 1]
259
+
260
+ # Eliminate pixels outside view frustum
261
+ valid_pix = np.logical_and(pix_x >= 0,
262
+ np.logical_and(pix_x < im_w,
263
+ np.logical_and(pix_y >= 0,
264
+ np.logical_and(pix_y < im_h,
265
+ pix_z > 0))))
266
+ if mask is not None:
267
+ mask_queries = mask[pix_y[valid_pix],pix_x[valid_pix]]
268
+ valid_pix[valid_pix] = mask_queries # the entries selected by valid_pix are already True
269
+
270
+ depth_val = np.zeros(pix_x.shape)
271
+ depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
272
+
273
+ # Integrate TSDF
274
+ depth_diff = depth_val - pix_z
275
+ valid_pts = np.logical_and(depth_val > 0, depth_diff >= -self._trunc_margin)
276
+ dist = np.minimum(1, depth_diff / self._trunc_margin)
277
+ valid_vox_x = self.vox_coords[valid_pts, 0]
278
+ valid_vox_y = self.vox_coords[valid_pts, 1]
279
+ valid_vox_z = self.vox_coords[valid_pts, 2]
280
+ w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
281
+ tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
282
+ valid_dist = dist[valid_pts]
283
+ tsdf_vol_new, w_new = self.integrate_tsdf(tsdf_vals, valid_dist, w_old, obs_weight)
284
+ self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
285
+ self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
286
+
287
+ # Integrate color
288
+ old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
289
+ old_b = np.floor(old_color / self._color_const)
290
+ old_g = np.floor((old_color-old_b*self._color_const)/256)
291
+ old_r = old_color - old_b*self._color_const - old_g*256
292
+ new_color = color_im[pix_y[valid_pts],pix_x[valid_pts]]
293
+ new_b = np.floor(new_color / self._color_const)
294
+ new_g = np.floor((new_color - new_b*self._color_const) /256)
295
+ new_r = new_color - new_b*self._color_const - new_g*256
296
+ new_b = np.minimum(255., np.round((w_old*old_b + obs_weight*new_b) / w_new))
297
+ new_g = np.minimum(255., np.round((w_old*old_g + obs_weight*new_g) / w_new))
298
+ new_r = np.minimum(255., np.round((w_old*old_r + obs_weight*new_r) / w_new))
299
+ self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = new_b*self._color_const + new_g*256 + new_r
300
+
301
+ def get_volume(self):
302
+ if self.gpu_mode:
303
+ cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
304
+ cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
305
+ return self._tsdf_vol_cpu, self._color_vol_cpu
306
+
307
+ def get_point_cloud(self):
308
+ """Extract a point cloud from the voxel volume.
309
+ """
310
+ tsdf_vol, color_vol = self.get_volume()
311
+
312
+ # Marching cubes
313
+ verts = measure.marching_cubes(tsdf_vol, level=0, method='lewiner')[0]
314
+ verts_ind = np.round(verts).astype(int)
315
+ verts = verts*self._voxel_size + self._vol_origin
316
+
317
+ # Get vertex colors
318
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
319
+ colors_b = np.floor(rgb_vals / self._color_const)
320
+ colors_g = np.floor((rgb_vals - colors_b*self._color_const) / 256)
321
+ colors_r = rgb_vals - colors_b*self._color_const - colors_g*256
322
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
323
+ colors = colors.astype(np.uint8)
324
+
325
+ pc = np.hstack([verts, colors])
326
+ return pc
327
+
328
+ def get_mesh(self):
329
+ """Compute a mesh from the voxel volume using marching cubes.
330
+ """
331
+ tsdf_vol, color_vol = self.get_volume()
332
+
333
+ # Marching cubes
334
+ verts, faces, norms, vals = measure.marching_cubes(tsdf_vol, level=0, method='lewiner')
335
+ verts_ind = np.round(verts).astype(int)
336
+ verts = verts*self._voxel_size+self._vol_origin # voxel grid coordinates to world coordinates
337
+
338
+ # Get vertex colors
339
+ rgb_vals = color_vol[verts_ind[:,0], verts_ind[:,1], verts_ind[:,2]]
340
+ colors_b = np.floor(rgb_vals/self._color_const)
341
+ colors_g = np.floor((rgb_vals-colors_b*self._color_const)/256)
342
+ colors_r = rgb_vals-colors_b*self._color_const-colors_g*256
343
+ colors = np.floor(np.asarray([colors_r,colors_g,colors_b])).T
344
+ colors = colors.astype(np.uint8)
345
+ return verts, faces, norms, colors
346
+
347
+
348
+ def rigid_transform(xyz, transform):
349
+ """Applies a rigid transform to an (N, 3) pointcloud.
350
+ """
351
+ xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)])
352
+ xyz_t_h = np.dot(transform, xyz_h.T).T
353
+ return xyz_t_h[:, :3]
354
+
355
+
356
+ def get_view_frustum(depth_im, cam_intr, cam_pose):
357
+ """Get corners of 3D camera view frustum of depth image
358
+ """
359
+ im_h = depth_im.shape[0]
360
+ im_w = depth_im.shape[1]
361
+ max_depth = np.max(depth_im)
362
+ view_frust_pts = np.array([
363
+ (np.array([0,0,0,im_w,im_w])-cam_intr[0,2])*np.array([0,max_depth,max_depth,max_depth,max_depth])/cam_intr[0,0],
364
+ (np.array([0,0,im_h,0,im_h])-cam_intr[1,2])*np.array([0,max_depth,max_depth,max_depth,max_depth])/cam_intr[1,1],
365
+ np.array([0,max_depth,max_depth,max_depth,max_depth])
366
+ ])
367
+ view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T
368
+ return view_frust_pts
369
+
370
+
371
+ def meshwrite(filename, verts, faces, norms, colors):
372
+ """Save a 3D mesh to a polygon .ply file.
373
+ """
374
+ # Write header
375
+ ply_file = open(filename,'w')
376
+ ply_file.write("ply\n")
377
+ ply_file.write("format ascii 1.0\n")
378
+ ply_file.write("element vertex %d\n"%(verts.shape[0]))
379
+ ply_file.write("property float x\n")
380
+ ply_file.write("property float y\n")
381
+ ply_file.write("property float z\n")
382
+ ply_file.write("property float nx\n")
383
+ ply_file.write("property float ny\n")
384
+ ply_file.write("property float nz\n")
385
+ ply_file.write("property uchar red\n")
386
+ ply_file.write("property uchar green\n")
387
+ ply_file.write("property uchar blue\n")
388
+ ply_file.write("element face %d\n"%(faces.shape[0]))
389
+ ply_file.write("property list uchar int vertex_index\n")
390
+ ply_file.write("end_header\n")
391
+
392
+ # Write vertex list
393
+ for i in range(verts.shape[0]):
394
+ ply_file.write("%f %f %f %f %f %f %d %d %d\n"%(
395
+ verts[i,0], verts[i,1], verts[i,2],
396
+ norms[i,0], norms[i,1], norms[i,2],
397
+ colors[i,0], colors[i,1], colors[i,2],
398
+ ))
399
+
400
+ # Write face list
401
+ for i in range(faces.shape[0]):
402
+ ply_file.write("3 %d %d %d\n"%(faces[i,0], faces[i,1], faces[i,2]))
403
+
404
+ ply_file.close()
405
+
406
+
407
+ def pcwrite(filename, xyzrgb):
408
+ """Save a point cloud to a polygon .ply file.
409
+ """
410
+ xyz = xyzrgb[:, :3]
411
+ rgb = xyzrgb[:, 3:].astype(np.uint8)
412
+
413
+ # Write header
414
+ ply_file = open(filename,'w')
415
+ ply_file.write("ply\n")
416
+ ply_file.write("format ascii 1.0\n")
417
+ ply_file.write("element vertex %d\n"%(xyz.shape[0]))
418
+ ply_file.write("property float x\n")
419
+ ply_file.write("property float y\n")
420
+ ply_file.write("property float z\n")
421
+ ply_file.write("property uchar red\n")
422
+ ply_file.write("property uchar green\n")
423
+ ply_file.write("property uchar blue\n")
424
+ ply_file.write("end_header\n")
425
+
426
+ # Write vertex list
427
+ for i in range(xyz.shape[0]):
428
+ ply_file.write("%f %f %f %d %d %d\n"%(
429
+ xyz[i, 0], xyz[i, 1], xyz[i, 2],
430
+ rgb[i, 0], rgb[i, 1], rgb[i, 2],
431
+ ))
+ ply_file.close()
432
+
433
+ def get_vol_bds(pred_depths : torch.Tensor, pred_c2ws : torch.Tensor, pred_intr : torch.Tensor):
434
+ n_views = pred_depths.shape[0]
435
+ vol_bnds = np.zeros((3,2))
436
+
437
+ for i in range(n_views):
438
+ intr = pred_intr[i].cpu().numpy()
439
+ c2w = pred_c2ws[i].cpu().numpy()
440
+ depth = pred_depths[i].cpu().numpy()
441
+ view_frust_pts = get_view_frustum(depth, intr, c2w)
442
+ vol_bnds[:,0] = np.minimum(vol_bnds[:,0], np.amin(view_frust_pts, axis=1))
443
+ vol_bnds[:,1] = np.maximum(vol_bnds[:,1], np.amax(view_frust_pts, axis=1))
444
+
445
+ return vol_bnds
446
+
447
+ def fuse_batch(pred_dict: dict, gt_dict: dict, batch:dict,voxel_size: float = 0.02):
448
+ pred_depths = pred_dict['pointmaps'][...,-1] # depth here is just z, assuming the predicted point map is in camera frame
449
+ pred_c2ws = batch['new_cams']['c2ws']
450
+ pred_intr = batch['new_cams']['Ks']
451
+ pred_masks = batch['new_cams']['valid_masks']
452
+ B = pred_depths.shape[0]
453
+ n_views = pred_depths.shape[1]
454
+
455
+ meshes = []
456
+ for i in range(B):
457
+ intrs = pred_intr[i]
458
+ c2ws = pred_c2ws[i]
459
+ depths = pred_depths[i]
460
+ vol_bnds = get_vol_bds(depths, c2ws, intrs)
461
+ tsdf_vol = TSDFVolume(vol_bnds, voxel_size=voxel_size)
462
+ masks = pred_masks[i]
463
+
464
+ for j in range(n_views):
465
+ intr = intrs[j]
466
+ c2w = c2ws[j]
467
+ depth = depths[j]
468
+ mask = masks[j]
469
+ depth[~mask] = 0
470
+ img = torch.zeros_like(depth,dtype=torch.uint8).unsqueeze(-1).repeat(1,1,3)
471
+ img[:,:,-1] = 255
472
+ tsdf_vol.integrate(img.cpu().numpy(), depth.cpu().numpy(), intr.cpu().numpy(), c2w.cpu().numpy(), obs_weight=1.)
473
+
474
+ verts, faces, norms, colors = tsdf_vol.get_mesh()
475
+ meshes.append(dict(verts=verts, faces=faces, norms=norms, colors=colors))
476
+ return meshes
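A minimal end-to-end usage sketch of the fusion utilities above, assuming this file is importable as utils.fusion (the actual path is not shown here), a synthetic step-shaped depth map, an identity camera pose, and gt_dict=None (fuse_batch does not read it):

import torch
from utils.fusion import fuse_batch, meshwrite   # assumed module path

H, W = 48, 64
depth = torch.full((H, W), 0.8)
depth[:, : W // 2] = 0.5                          # near plane on the left, far plane on the right
K = torch.tensor([[60., 0., W / 2], [0., 60., H / 2], [0., 0., 1.]])

pointmaps = torch.zeros(1, 1, H, W, 3)            # (B, n_views, H, W, 3); only the z channel is read
pointmaps[..., 2] = depth
pred_dict = {'pointmaps': pointmaps}
batch = {'new_cams': {
    'c2ws': torch.eye(4).view(1, 1, 4, 4),
    'Ks': K.view(1, 1, 3, 3),
    'valid_masks': torch.ones(1, 1, H, W, dtype=torch.bool),
}}

meshes = fuse_batch(pred_dict, None, batch, voxel_size=0.01)
m = meshes[0]
meshwrite('fused.ply', m['verts'], m['faces'], m['norms'], m['colors'])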
utils/geometry.py ADDED
@@ -0,0 +1,195 @@
1
+ import numpy as np
2
+ import torch
3
+ import copy
4
+ from utils.utils import invalid_to_nans, invalid_to_zeros
5
+
6
+ def compute_pointmap(depth, cam2w, intrinsics):
7
+ fx, fy = intrinsics[0, 0], intrinsics[1, 1]
8
+ cx, cy = intrinsics[0, 2], intrinsics[1, 2]
9
+ h, w = depth.shape
10
+
11
+ i, j = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')
12
+
13
+ x_cam = (i - cx) * depth / fx
14
+ y_cam = (j - cy) * depth / fy
15
+
16
+ points_cam = np.stack([x_cam, y_cam, depth], axis=-1)
17
+ points_world = np.dot(cam2w[:3, :3], points_cam.reshape(-1, 3).T).T + cam2w[:3, 3]
18
+ points_world = points_world.reshape(h, w, 3)
19
+
20
+ return points_world
21
+
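A small sanity check for compute_pointmap, assuming a synthetic constant-depth image and an identity camera pose; the pixel at the principal point should unproject to (0, 0, depth):

import numpy as np
from utils.geometry import compute_pointmap

K = np.array([[100., 0., 32.], [0., 100., 24.], [0., 0., 1.]])
depth = np.full((48, 64), 2.0)                # 2 m everywhere
pts = compute_pointmap(depth, np.eye(4), K)   # (48, 64, 3) points in the world frame
print(pts[24, 32])                            # ~[0. 0. 2.] at the principal point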
22
+ def invert_poses(raw_poses):
23
+ poses = copy.deepcopy(raw_poses)
24
+ original_shape = poses.shape
25
+ poses = poses.reshape(-1, 4, 4)
26
+ R = copy.deepcopy(poses[:, :3, :3])
27
+ t = copy.deepcopy(poses[:, :3, 3])
28
+ poses[:, :3, :3] = R.transpose(1, 2)
29
+ poses[:, :3, 3] = torch.bmm(-R.transpose(1, 2), t.unsqueeze(-1)).squeeze(-1)
30
+ poses = poses.reshape(*original_shape)
31
+ return poses
32
+
33
+ def center_pointmaps_set(dict,w2cs):
34
+ swap_dim = False
35
+ if dict["pointmaps"].shape[1] == 3:
36
+ swap_dim = True
37
+ dict["pointmaps"] = dict["pointmaps"].transpose(1,-1)
38
+
39
+ original_shape = dict["pointmaps"].shape
40
+ device = dict["pointmaps"].device
41
+ B = original_shape[0]
42
+
43
+ # recompute pointmaps in camera frame
44
+ pointmaps = dict["pointmaps"]
45
+ pointmaps_h = torch.cat([pointmaps,torch.ones(pointmaps.shape[:-1]+(1,)).to(device)],dim=-1)
46
+ pointmaps_h = pointmaps_h.reshape(B,-1,4)
47
+ pointmaps_recentered_h = torch.bmm(w2cs,pointmaps_h.transpose(1,2)).transpose(1,2)
48
+ pointmaps_recentered = pointmaps_recentered_h[...,:3]/pointmaps_recentered_h[...,3:4]
49
+ pointmaps_recentered = pointmaps_recentered.reshape(*original_shape)
50
+
51
+ # recompute c2ws
52
+ if "c2ws" in dict:
53
+ c2ws_recentered = torch.bmm(w2cs,dict["c2ws"].reshape(-1,4,4))
54
+ c2ws_recentered = c2ws_recentered.reshape(dict["c2ws"].shape)
55
+ dict["c2ws"] = c2ws_recentered
56
+
57
+ # assign to dict
58
+ dict["pointmaps"] = pointmaps_recentered
59
+ if swap_dim:
60
+ dict["pointmaps"] = dict["pointmaps"].transpose(1,-1)
61
+ return dict
62
+
63
+ def center_pointmaps(batch):
64
+ original_poses = batch["new_cams"]["c2ws"] # assuming first camera is the one we want to predict
65
+ w2cs = invert_poses(batch["new_cams"]["c2ws"])
66
+
67
+ batch["new_cams"] = center_pointmaps_set(batch["new_cams"],w2cs)
68
+ batch["input_cams"] = center_pointmaps_set(batch["input_cams"],w2cs)
69
+ batch["original_poses"] = original_poses
70
+ return batch
71
+
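Because center_pointmaps re-expresses both camera sets in the frames of the new cameras, the new c2ws become the identity afterwards. A minimal check sketch, assuming flattened (B, 4, 4) poses and dummy (B, H, W, 3) pointmaps as expected by center_pointmaps_set:

import torch
from utils.geometry import center_pointmaps

B, H, W = 2, 8, 8
c2ws = torch.eye(4).repeat(B, 1, 1)
c2ws[:, :3, 3] = torch.randn(B, 3)            # random camera translations
batch = {
    "new_cams":   {"pointmaps": torch.randn(B, H, W, 3), "c2ws": c2ws.clone()},
    "input_cams": {"pointmaps": torch.randn(B, H, W, 3), "c2ws": c2ws.clone()},
}
batch = center_pointmaps(batch)
# after centering, each new camera sits at the origin of its own frame
assert torch.allclose(batch["new_cams"]["c2ws"], torch.eye(4).expand(B, 4, 4), atol=1e-5)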
72
+
73
+ def uncenter_pointmaps(pred,gt,batch):
74
+ original_poses = batch["original_poses"]
75
+
76
+ batch["new_cams"] = center_pointmaps_set(batch["new_cams"],original_poses)
77
+ batch["input_cams"] = center_pointmaps_set(batch["input_cams"],original_poses)
78
+
79
+ #gt = center_pointmaps_set(gt,original_poses)
80
+ #pred = center_pointmaps_set(pred,original_poses)
81
+ return pred, gt, batch
82
+
83
+ def compute_rays(batch):
84
+ h, w = batch["new_cams"]["pointmaps"].shape[-3:-1]
85
+ B = batch["new_cams"]["pointmaps"].shape[0]
86
+ device = batch["new_cams"]["pointmaps"].device
87
+ Ks = batch["new_cams"]["Ks"]
88
+ i_s, j_s = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')
89
+ i_s, j_s = torch.tensor(i_s).repeat(B,1,1).to(device), torch.tensor(j_s).repeat(B,1,1).to(device)
90
+
91
+ f_x = Ks[:,0,0].reshape(-1,1,1)
92
+ f_y = Ks[:,1,1].reshape(-1,1,1)
93
+ c_x = Ks[:,0,2].reshape(-1,1,1)
94
+ c_y = Ks[:,1,2].reshape(-1,1,1)
95
+
96
+ # compute rays with z=1
97
+ x_cam = (i_s - c_x) / f_x
98
+ y_cam = (j_s - c_y) / f_y
99
+ rays = torch.cat([x_cam.unsqueeze(-1),y_cam.unsqueeze(-1)],dim=-1)
100
+ return rays
101
+
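compute_rays returns only the x/y components of the unit-depth ray through each pixel (z = 1 is implicit). A quick sketch, assuming one camera with focal length 100 px and the principal point at the image center:

import torch
from utils.geometry import compute_rays

B, H, W = 1, 48, 64
K = torch.tensor([[100., 0., W / 2], [0., 100., H / 2], [0., 0., 1.]])
batch = {"new_cams": {"pointmaps": torch.zeros(B, H, W, 3), "Ks": K.view(1, 3, 3)}}
rays = compute_rays(batch)                    # (B, H, W, 2): ((x - cx) / fx, (y - cy) / fy)
print(rays.shape, rays[0, H // 2, W // 2])    # the central pixel maps to ~(0, 0)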
102
+ def normalize_pointcloud(pts1, pts2=None, norm_mode='avg_dis', valid1=None, valid2=None, valid3=None, ret_factor=False,pts3=None):
103
+ assert pts1.ndim >= 3 and pts1.shape[-1] == 3
104
+ assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3)
105
+ norm_mode, dis_mode = norm_mode.split('_')
106
+
107
+ if norm_mode == 'avg':
108
+ # gather all points together (joint normalization)
109
+ nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3)
110
+ nan_pts2, nnz2 = invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0)
111
+ all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
112
+ if pts3 is not None:
113
+ nan_pts3, nnz3 = invalid_to_zeros(pts3, valid3, ndim=3)
114
+ all_pts = torch.cat((all_pts, nan_pts3), dim=1)
115
+ nnz1 += nnz3
116
+ # compute distance to origin
117
+ all_dis = all_pts.norm(dim=-1)
118
+ if dis_mode == 'dis':
119
+ pass # do nothing
120
+ elif dis_mode == 'log1p':
121
+ all_dis = torch.log1p(all_dis)
122
+ elif dis_mode == 'warp-log1p':
123
+ # actually warp input points before normalizing them
124
+ log_dis = torch.log1p(all_dis)
125
+ warp_factor = log_dis / all_dis.clip(min=1e-8)
126
+ H1, W1 = pts1.shape[1:-1]
127
+ pts1 = pts1 * warp_factor[:,:W1*H1].view(-1,H1,W1,1)
128
+ if pts2 is not None:
129
+ H2, W2 = pts2.shape[1:-1]
130
+ pts2 = pts2 * warp_factor[:,W1*H1:].view(-1,H2,W2,1)
131
+ all_dis = log_dis # this is their true distance afterwards
132
+ else:
133
+ raise ValueError(f'bad {dis_mode=}')
134
+
135
+ norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8)
136
+ else:
137
+ # gather all points together (joint normalization)
138
+ nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3)
139
+ nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None
140
+ all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
141
+
142
+ # compute distance to origin
143
+ all_dis = all_pts.norm(dim=-1)
144
+
145
+ if norm_mode == 'avg':
146
+ norm_factor = all_dis.nanmean(dim=1)
147
+ elif norm_mode == 'median':
148
+ norm_factor = all_dis.nanmedian(dim=1).values.detach()
149
+ elif norm_mode == 'sqrt':
150
+ norm_factor = all_dis.sqrt().nanmean(dim=1)**2
151
+ else:
152
+ raise ValueError(f'bad {norm_mode=}')
153
+
154
+ norm_factor = norm_factor.clip(min=1e-8)
155
+ while norm_factor.ndim < pts1.ndim:
156
+ norm_factor.unsqueeze_(-1)
157
+
158
+ res = (pts1 / norm_factor,)
159
+ if pts2 is not None:
160
+ res = res + (pts2 / norm_factor,)
161
+ if pts3 is not None:
162
+ res = res + (pts3 / norm_factor,)
163
+ if ret_factor:
164
+ res = res + (norm_factor,)
165
+ return res
166
+
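A short usage sketch of normalize_pointcloud in 'avg_dis' mode, assuming a single (B, H, W, 3) point map with a boolean validity mask; the returned factor is the mean distance of the valid points to the origin, so the normalized valid points have mean distance ~1:

import torch
from utils.geometry import normalize_pointcloud

pts = torch.randn(2, 16, 16, 3) * 5.0
valid = torch.rand(2, 16, 16) > 0.2
pts_n, factor = normalize_pointcloud(pts, norm_mode='avg_dis', valid1=valid, ret_factor=True)
print(pts_n[valid].norm(dim=-1).mean(), factor.view(-1))   # ~1.0 and the per-batch scale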
167
+ def compute_pointmap_torch(depth, cam2w, intrinsics,device='cuda'):
168
+ fx, fy = intrinsics[0, 0], intrinsics[1, 1]
169
+ cx, cy = intrinsics[0, 2], intrinsics[1, 2]
170
+ h, w = depth.shape
171
+
172
+ #i, j = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')
173
+ i, j = torch.meshgrid(torch.arange(w).to(device), torch.arange(h).to(device), indexing='xy')
174
+ x_cam = (i - cx) * depth / fx
175
+ y_cam = (j - cy) * depth / fy
176
+
177
+ points_cam = torch.stack([x_cam, y_cam, depth], dim=-1)
178
+ points_world = (cam2w[:3, :3] @ points_cam.reshape(-1, 3).T).T + cam2w[:3, 3]
179
+ points_world = points_world.reshape(h, w, 3)
180
+
181
+ return points_world
182
+
183
+ def depth2pts(depths, Ks):
184
+ """
185
+ Convert depth map to 3D points
186
+ """
187
+ device = depths.device
188
+ B = depths.shape[0]
189
+ pts = []
190
+ for b in range(B):
191
+ depth_b = depths[b]
192
+ K = Ks[b]
193
+ pts.append(compute_pointmap_torch(depth_b,torch.eye(4).to(device), K,device))
194
+ pts = torch.stack(pts, dim=0)
195
+ return pts
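A small usage sketch of depth2pts on CPU, assuming constant-depth maps; since the extrinsic is the identity, the principal-point pixel maps to (0, 0, depth):

import torch
from utils.geometry import depth2pts

depths = torch.full((2, 48, 64), 1.5)                 # (B, H, W) depth maps
Ks = torch.tensor([[100., 0., 32.], [0., 100., 24.], [0., 0., 1.]]).repeat(2, 1, 1)
pts = depth2pts(depths, Ks)                           # (2, 48, 64, 3) points in each camera frame
print(pts.shape, pts[0, 24, 32])                      # ~[0, 0, 1.5] at the principal point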
utils/misc.py ADDED
@@ -0,0 +1,122 @@
1
+ import os
2
+ import copy
3
+ from pathlib import Path
4
+ import torch
5
+ import torch.distributed as dist
6
+ import numpy as np
7
+ import math
8
+ import socket
9
+ # source: https://github.com/LTH14/mar/blob/main/util/misc.py
10
+
11
+ def prep_torch():
12
+ cpu_cores = get_cpu_cores()
13
+ torch.set_num_threads(cpu_cores) # intra-op threads (e.g., matrix ops)
14
+ torch.set_num_interop_threads(cpu_cores) # inter-op parallelism
15
+
16
+ os.environ["OMP_NUM_THREADS"] = str(cpu_cores)
17
+ os.environ["MKL_NUM_THREADS"] = str(cpu_cores)
18
+ os.environ["OPENBLAS_NUM_THREADS"] = str(cpu_cores)
19
+
20
+ def get_cpu_cores():
21
+ hostname = socket.gethostname()
22
+ if "bridges2" in hostname:
23
+ return int(os.environ["SLURM_JOB_CPUS_PER_NODE"])
24
+ else:
25
+ try:
26
+ with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", "r") as f:
27
+ quota = int(f.read().strip())
28
+ with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us", "r") as f:
29
+ period = int(f.read().strip())
30
+ if quota > 0:
31
+ return max(1, quota // period)
32
+ except Exception as e:
33
+ return os.cpu_count()
34
+
35
+ def setup_distributed():
36
+ dist.init_process_group(backend='nccl')
37
+ # Get the rank of the current process
38
+ rank = int(os.environ.get('RANK'))
39
+ world_size = int(os.environ.get('WORLD_SIZE'))
40
+ local_rank = int(os.environ.get('LOCAL_RANK'))
41
+ torch.cuda.set_device(local_rank)
42
+ return rank, world_size, local_rank
43
+
44
+ def is_dist_avail_and_initialized():
45
+ if not dist.is_available():
46
+ return False
47
+ if not dist.is_initialized():
48
+ return False
49
+ return True
50
+
51
+ def get_rank():
52
+ if not is_dist_avail_and_initialized():
53
+ return 0
54
+ return dist.get_rank()
55
+
56
+ def is_main_process():
57
+ return get_rank() == 0
58
+
59
+ def get_world_size():
60
+ if not is_dist_avail_and_initialized():
61
+ return 1
62
+ return dist.get_world_size()
63
+
64
+ def save_on_master(*args, **kwargs):
65
+ if is_main_process():
66
+ torch.save(*args, **kwargs)
67
+
68
+ def save_model(args, epoch, model, optimizer, ema_params=None, epoch_name=None):
69
+ if epoch_name is None:
70
+ epoch_name = str(epoch)
71
+
72
+ output_dir = Path(args.logdir)
73
+ checkpoint_path = output_dir / ('checkpoint-%s.pth' % epoch_name)
74
+
75
+ if ema_params is not None:
76
+ ema_state_dict = copy.deepcopy(model.state_dict())
77
+ for i, (name, _value) in enumerate(model.named_parameters()):
78
+ assert name in ema_state_dict
79
+ ema_state_dict[name] = ema_params[i]
80
+ else:
81
+ ema_state_dict = None
82
+
83
+ to_save = {
84
+ 'model': model.state_dict(),
85
+ 'optimizer': optimizer.state_dict(),
86
+ 'epoch': epoch,
87
+ 'args': args,
88
+ 'model_ema': ema_state_dict,
89
+ }
90
+
91
+ save_on_master(to_save, checkpoint_path)
92
+
93
+ def adjust_learning_rate(optimizer, epoch, args):
94
+ """Decay the learning rate with half-cycle cosine after warmup"""
95
+ if epoch < args.warmup_epochs:
96
+ lr = args.lr * epoch / args.warmup_epochs
97
+ else:
98
+ lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \
99
+ (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.n_epochs - args.warmup_epochs)))
100
+ for param_group in optimizer.param_groups:
101
+ if "lr_scale" in param_group:
102
+ param_group["lr"] = lr * param_group["lr_scale"]
103
+ else:
104
+ param_group["lr"] = lr
105
+
106
+ return lr
107
+
108
+
109
+ def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
110
+ decay = []
111
+ no_decay = []
112
+ for name, param in model.named_parameters():
113
+ if not param.requires_grad:
114
+ continue # frozen weights
115
+ if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list or 'diffloss' in name:
116
+ no_decay.append(param) # no weight decay on bias, norm and diffloss
117
+ else:
118
+ decay.append(param)
119
+ return [
120
+ {'params': no_decay, 'weight_decay': 0.},
121
+ {'params': decay, 'weight_decay': weight_decay}]
122
+
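add_weight_decay splits parameters into a no-decay group (biases, norms, anything named 'diffloss') and a decay group. A minimal usage sketch with a toy model; the learning rate here is just a placeholder:

import torch
from utils.misc import add_weight_decay

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
param_groups = add_weight_decay(model, weight_decay=0.05)
optimizer = torch.optim.AdamW(param_groups, lr=1.5e-4)
print([len(g['params']) for g in param_groups])   # [3, 1]: biases + norm params vs. the linear weight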
utils/utils.py ADDED
@@ -0,0 +1,82 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ def to_tensor(x,dtype=torch.float64):
5
+ if isinstance(x, torch.Tensor):
6
+ return x.to(dtype)
7
+ elif isinstance(x, np.ndarray):
8
+ return torch.from_numpy(x.copy()).to(dtype)
9
+ else:
10
+ raise ValueError(f"Unsupported type: {type(x)}")
11
+
12
+ def to_numpy(x):
13
+ if isinstance(x, torch.Tensor):
14
+ return x.detach().cpu().numpy()
15
+ elif isinstance(x, np.ndarray):
16
+ return x
17
+ else:
18
+ raise ValueError(f"Unsupported type: {type(x)}")
19
+
20
+ def invalid_to_nans( arr, valid_mask, ndim=999 ):
21
+ if valid_mask is not None:
22
+ arr = arr.clone()
23
+ arr[~valid_mask] = float('nan')
24
+ if arr.ndim > ndim:
25
+ arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
26
+ return arr
27
+
28
+ def invalid_to_zeros( arr, valid_mask, ndim=999 ):
29
+ if valid_mask is not None:
30
+ arr = arr.clone()
31
+ arr[~valid_mask] = 0
32
+ nnz = valid_mask.view(len(valid_mask), -1).sum(1)
33
+ else:
34
+ nnz = arr.numel() // len(arr) if len(arr) else 0 # number of point per image
35
+ if arr.ndim > ndim:
36
+ arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
37
+ return arr, nnz
38
+
39
+ def scenes_to_batch(scenes,repeat=None):
40
+ batch = {}
41
+ n_cams = None
42
+
43
+ if 'new_cams' in scenes:
44
+ n_cams = scenes['new_cams']['depths'].shape[1]
45
+ batch['new_cams'], n_cams = scenes_to_batch(scenes['new_cams'])
46
+ batch['input_cams'],_ = scenes_to_batch(scenes['input_cams'],repeat=n_cams)
47
+ else:
48
+ for key in scenes.keys():
49
+ shape = scenes[key].shape
50
+ if len(shape) > 3 :
51
+ n_cams = shape[1]
52
+ if repeat is not None:
53
+ # repeat the 2nd dimension by repeat times to also have the inputs repeated in the batch
54
+ repeat_dims = (1,) * len(shape) # (1,1,1,...) for all dimensions
55
+ repeat_dims = list(repeat_dims)
56
+ repeat_dims[1] = repeat
57
+ batch[key] = scenes[key].repeat(*repeat_dims)
58
+ batch[key] = batch[key].reshape(-1, *shape[2:])
59
+ else:
60
+ batch[key] = scenes[key].reshape(-1, *shape[2:])
61
+ elif key == 'dino_features':
62
+ repeat_shape = (repeat,) + (1,) * (len(shape) - 1)
63
+ batch[key] = scenes[key].repeat(*repeat_shape)
64
+ else:
65
+ batch[key] = scenes[key]
66
+ return batch, n_cams
67
+
68
+ def dict_to_scenes(input_dict,n_cams):
69
+ scenes = {}
70
+ for key in input_dict.keys():
71
+ if isinstance(input_dict[key],dict):
72
+ scenes[key] = dict_to_scenes(input_dict[key],n_cams)
73
+ else:
74
+ scenes[key] = input_dict[key].reshape(-1, n_cams, *input_dict[key].shape[1:])
75
+ return scenes
76
+
77
+ def batch_to_scenes(pred,gt,batch,n_cams):
78
+ # pred
79
+ batch = dict_to_scenes(batch,n_cams)
80
+ pred = dict_to_scenes(pred,n_cams)
81
+ gt = dict_to_scenes(gt,n_cams)
82
+ return pred, gt, batch
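scenes_to_batch flattens the (B, n_cams, ...) scene tensors into a flat batch and dict_to_scenes undoes it. A small round-trip sketch on a flat dict (the nested new_cams/input_cams case follows the same pattern):

import torch
from utils.utils import scenes_to_batch, dict_to_scenes

scenes = {'depths': torch.randn(2, 3, 16, 16), 'Ks': torch.randn(2, 3, 3, 3)}
batch, n_cams = scenes_to_batch(scenes)      # (B, n_cams, ...) -> (B * n_cams, ...)
print(batch['depths'].shape, n_cams)         # torch.Size([6, 16, 16]) 3
restored = dict_to_scenes(batch, n_cams)     # reshape back to (B, n_cams, ...)
print(restored['depths'].shape)              # torch.Size([2, 3, 16, 16])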
utils/viz.py ADDED
@@ -0,0 +1,205 @@
1
+ bb = breakpoint
2
+ import torch
3
+ import numpy as np
4
+ from utils.utils import to_tensor, to_numpy
5
+ import open3d as o3d
6
+ import rerun as rr
7
+
8
+ OPENCV2OPENGL = (1,-1,-1,1)
9
+
10
+ def pts_to_opengl(pts):
11
+ return pts*OPENCV2OPENGL[:3]
12
+
13
+ def save_pointmaps(data,path='debug',view=False,color='novelty',frustrum_scale=20):
14
+ # debug function to save points to a ply file
15
+ import open3d as o3d
16
+ pointmaps = data['pointmaps']
17
+ B = pointmaps.shape[0]
18
+ W, H = pointmaps.shape[-3:-1]
19
+ n_cams = data['c2ws'].shape[1]
20
+ geometries = []
21
+ for b in range(B):
22
+ geometry_b = []
23
+ points = torch.cat([p.flatten(start_dim=0,end_dim=1) for p in pointmaps[b]],dim=0)
24
+ if view:
25
+ pcd = o3d.geometry.PointCloud()
26
+ pcd.points = o3d.utility.Vector3dVector(to_numpy(points))
27
+ if color == 'novelty':
28
+ colors = torch.ones_like(points)
29
+ pts_p_cam = W*H
30
+ # make all novel points red
31
+ colors[pts_p_cam:,1:]*=0.1
32
+
33
+ # make all points from the first camera green (the red and blue channels are dimmed below)
34
+ colors[:pts_p_cam,0]*=0.1
35
+ colors[:pts_p_cam,2]*=0.1
36
+ colors*=255.0
37
+
38
+ else:
39
+ colors = torch.cat([p.flatten(start_dim=0,end_dim=1) for p in data['imgs'][b]],dim=0)
40
+ pcd.colors = o3d.utility.Vector3dVector(to_numpy(colors)/255.0)
41
+ geometry_b.append(pcd)
42
+ origin = o3d.geometry.TriangleMesh.create_coordinate_frame(
43
+ size=10, origin=[0,0,0])
44
+ geometry_b.append(origin)
45
+ for i in range(n_cams):
46
+ K = data['Ks'][b,i].cpu().numpy()
47
+ K = o3d.camera.PinholeCameraIntrinsic(W,H,K)
48
+ P = data['c2ws'][b,i].cpu().numpy()
49
+ cam_frame = o3d.geometry.LineSet.create_camera_visualization(intrinsic=K,extrinsic=P,scale=frustrum_scale)
50
+ geometry_b.append(cam_frame)
51
+ o3d.visualization.draw_geometries(geometry_b)
52
+
53
+ # add point at the origin
54
+ o3d.io.write_point_cloud(f"{path}_{b}.ply", pcd)
55
+ breakpoint()
56
+ geometries.append(geometry_b)
57
+ return geometries
58
+
59
+ def just_load_viz(pred_dict,gt_dict,batch,name='just_load_viz',addr='localhost:9000',fused_meshes=None,n_points=None):
60
+ rr.init(name)
61
+ rr.connect(addr)
62
+ rr.set_time_seconds("stable_time", 0)
63
+
64
+ context_views = batch['input_cams']['pointmaps']
65
+ context_rgbs = batch['input_cams']['imgs']
66
+ gt_pred_views = gt_dict['pointmaps']
67
+ pred_views = pred_dict['pointmaps']
68
+
69
+ # FIX this weird shape
70
+ pred_masks = batch['new_cams']['valid_masks']
71
+ context_masks = batch['input_cams']['valid_masks']
72
+
73
+ B = batch['new_cams']['pointmaps'].shape[0]
74
+ W,H = context_views.shape[-3:-1]
75
+ n_pred_cams = pred_views.shape[1]
76
+
77
+ for b in range(B):
78
+ rr.set_time_seconds("stable_time", b)
79
+ # Set world transform to identity (normal origin)
80
+ rr.log("world", rr.Transform3D(translation=[0, 0, 0], mat3x3=np.eye(3)))
81
+ ## show context views
82
+ context_rgb = to_numpy(context_rgbs[b])
83
+
84
+ for i in range(n_pred_cams):
85
+ if 'conf_pointmaps' in pred_dict:
86
+ conf_pts = pred_dict['conf_pointmaps'][b,i]
87
+
88
+ #print(f"view {i} mean conf: {mean_conf}, std conf: {std_conf}")
89
+ conf_pts = (conf_pts - conf_pts.min())/(conf_pts.max() - conf_pts.min())
90
+ conf_pts = to_numpy(conf_pts)
91
+ rr.log(f"view_{i}/pred_conf", rr.Image(conf_pts))
92
+ if pred_masks[b,i].sum() == 0:
93
+ continue
94
+ if gt_pred_views is not None:
95
+ gt_pred_pts = gt_pred_views[b,i][pred_masks[b,i]]
96
+ gt_pred_pts = to_numpy(gt_pred_pts)
97
+ else:
98
+ gt_pred_pts = None
99
+
100
+ # red is color for gt points
101
+ if gt_pred_pts is not None:
102
+ color = np.array([1,0,0])
103
+ colors = np.ones_like(gt_pred_pts)
104
+ colors[:,0] = color[0]
105
+ colors[:,1] = color[1]
106
+ colors[:,2] = color[2]
107
+ rr.log(
108
+ f"world/new_views_gt/view_{i}", rr.Points3D(gt_pred_pts,colors=colors)
109
+ )
110
+ # green is color for pred points
111
+ pred_pts = pred_views[b,i][pred_masks[b,i]]
112
+ pred_pts = to_numpy(pred_pts)
113
+
114
+ depth = pred_views[b,i][:,:,2]
115
+ depth -= depth[pred_masks[b,i]].min()
116
+ depth[~pred_masks[b,i]] = 0
117
+ depth /= depth.max()
118
+ depth = to_numpy(depth)
119
+ rr.log(f"world/new_views_pred/view_{i}/image", rr.Image(depth))
120
+
121
+ if 'classifier' in pred_dict:
122
+ classifier = (pred_dict['classifier'][b,i] > 0.0).float() # threshold raw logits at 0, i.e. a sigmoid probability of 0.5
123
+ classifier = to_numpy(classifier)
124
+ rr.log(f"view_{i}/pred_mask", rr.Image(classifier))
125
+
126
+ color = np.array([0,1,0])
127
+ colors = np.ones_like(pred_pts)
128
+ colors[:,0] = color[0]
129
+ colors[:,1] = color[1]
130
+ colors[:,2] = color[2]
131
+ if n_points is None:
132
+ rr.log(
133
+ f"world/new_views_pred/view_{i}/pred_points", rr.Points3D(pred_pts,colors=colors)
134
+ )
135
+ else:
136
+ # randomly sample n_points from pred_pts
137
+ n_points = min(n_points, pred_pts.shape[0])
138
+ inds = np.random.choice(pred_pts.shape[0], n_points, replace=False)
139
+ rr.log(
140
+ f"world/new_views_pred/view_{i}/pred_points", rr.Points3D(pred_pts[inds],colors=colors[inds])
141
+ )
142
+
143
+ K = batch['new_cams']['Ks'][b,i].cpu().numpy()
144
+ P = batch['new_cams']['c2ws'][b,i].cpu().numpy()
145
+ P = np.linalg.inv(P)
146
+ rr.log(f"world/new_views_pred/view_{i}", rr.Transform3D(translation=P[:3,3], mat3x3=P[:3,:3], from_parent=True))
147
+
148
+ rr.log(f"world/new_views_gt/view_{i}", rr.Transform3D(translation=P[:3,3], mat3x3=P[:3,:3], from_parent=True))
149
+
150
+ if 'classifier' in pred_dict:
151
+ classifier = gt_dict['valid_masks'][b,i].float()
152
+ classifier = to_numpy(classifier)
153
+ rr.log(f"view_{i}/gt_mask", rr.Image(classifier))
154
+
155
+ rr.log(
156
+ f"world/new_views_pred/view_{i}/image",
157
+ rr.Pinhole(
158
+ resolution=[H, W],
159
+ focal_length=[K[0,0], K[1,1]],
160
+ principal_point=[K[0,2], K[1,2]],
161
+ ),
162
+ )
163
+
164
+ rr.log(f"world/new_views_pred/view_{i}/image", rr.Image(to_numpy(pred_masks[b,i].float())))
165
+ n_input_cams = context_masks.shape[1]
166
+
167
+ for i in range(n_input_cams):
168
+ context_pts = context_views[b][i][context_masks[b][i]]
169
+ context_pts = to_numpy(context_pts)
170
+ context_pts_rgb = context_rgbs[b][i][context_masks[b][i]]
171
+ context_pts_rgb = to_numpy(context_pts_rgb)
172
+
173
+ # depth imgs
174
+ #context_depths = batch['input_cams']['depths'][b][i]
175
+ #context_depths = (context_depths / context_depths.max() * 255.0).clamp(0,255)
176
+ #context_depths = to_numpy(context_depths).astype(np.uint8)
177
+ rr.log(
178
+ f"world/context_views/view_{i}_points", rr.Points3D(context_pts,colors=(context_pts_rgb/255.0))
179
+ )
180
+
181
+ K = batch['input_cams']['Ks'][b,i].cpu().numpy()
182
+ P = batch['input_cams']['c2ws'][b,i].cpu().numpy()
183
+ P = np.linalg.inv(P)
184
+ rr.log(f"world/context_views/view_{i}", rr.Transform3D(translation=P[:3,3], mat3x3=P[:3,:3], from_parent=True))
185
+
186
+ rr.log(
187
+ f"world/context_views/view_{i}/image",
188
+ rr.Pinhole(
189
+ resolution=[H, W],
190
+ focal_length=[K[0,0], K[1,1]],
191
+ principal_point=[K[0,2], K[1,2]],
192
+ ),
193
+ )
194
+ context_rgb_i = context_rgb[i]
195
+ rr.log(
196
+ f"world/context_views/view_{i}/image", rr.Image(context_rgb_i)
197
+ )
198
+
199
+ rr.log(
200
+ f"world/context_camera_{i}/mask", rr.Image(to_numpy(context_masks[b,i].float()))
201
+ )
202
+ if fused_meshes is not None:
203
+ rr.log(f"world/fused_mesh", rr.Mesh3D(vertex_positions=fused_meshes[b]['verts'], vertex_normals=fused_meshes[b]['norms'], vertex_colors=fused_meshes[b]['colors'], triangle_indices=fused_meshes[b]['faces']))
204
+
205
+
xps/train_rayst3r.py ADDED
@@ -0,0 +1,127 @@
1
+ import sys
2
+ import socket
3
+ import os
4
+ # Add the current working directory to the Python path
5
+ current_dir = os.getcwd()
6
+ sys.path.append(current_dir)
7
+ from xps.util import *
8
+
9
+ root_log_dir = "logs"
10
+ n_views = 2
11
+ dataset_size = -1
12
+
13
+ imshape_input = (480,640)
14
+ imshape_output = (480,640)
15
+ render_size = (480,640)
16
+
17
+ preload_train = False
18
+ data_dirs = ["/home/jovyan/shared/bduister/data/processed/","/home/jovyan/shared/bduister/data-2/processed/"]
19
+ dino_features = [4,11,17,23]
20
+ datasets = ['fp_gso','octmae']
21
+ prefetch_dino = False
22
+ normalize_mode = 'median'
23
+ #start_from = "checkpoints/gso_conf.pth"
24
+ start_from = None
25
+
26
+ noise_std = 0.005
27
+ view_select_mode = "new_zoom"
28
+ rendered_views_mode = "always"
29
+ dataset_train = f"GenericLoader(size={dataset_size},seed=747,dir={repr(data_dirs)},split='train',datasets={datasets},mode='fast',prefetch_dino={prefetch_dino}," \
30
+ +f"dino_features={dino_features},view_select_mode='{view_select_mode}',noise_std={noise_std},rendered_views_mode='{rendered_views_mode}')"
31
+ dataset_test = f"GenericLoader(size=1000,seed=787,dir={repr(data_dirs)},split='test',datasets={datasets},mode='fast',prefetch_dino={prefetch_dino}," \
32
+ +f"dino_features={dino_features},view_select_mode='{view_select_mode}',noise_std={noise_std},rendered_views_mode='{rendered_views_mode}')"
33
+ dataset_just_load = f"GenericLoader(size=1000,seed=787,dir={repr(data_dirs)},split='test',datasets={datasets},mode='fast',prefetch_dino={prefetch_dino}," \
34
+ +f"dino_features={dino_features},view_select_mode='{view_select_mode}',noise_std={noise_std},rendered_views_mode='{rendered_views_mode}')"
35
+
36
+ augmentor = "Augmentor()"
37
+
38
+ patch_size = 16
39
+ save_every = 1
40
+
41
+ vit="base"
42
+ if vit == "debug":
43
+ enc_dim = 128
44
+ dec_dim = 128
45
+ n_heads = 4
46
+ enc_depth = 4
47
+ dec_depth = 4
48
+ head_n_layers = 1
49
+ head_dim = 128
50
+ lr = 3e-4
51
+ batch_size = 20
52
+ blr = 1.5e-4
53
+ elif vit == "debug_2":
54
+ enc_dim = 512
55
+ dec_dim = 512
56
+ n_heads = 4
57
+ enc_depth = 4
58
+ dec_depth = 10
59
+ head_n_layers = 1
60
+ head_dim = 128
61
+ blr = 1.5e-4
62
+ batch_size = 18
63
+ elif vit == "small":
64
+ enc_dim = 384
65
+ dec_dim = 384
66
+ n_heads = 6
67
+ enc_depth = 12
68
+ dec_depth = 12
69
+ head_n_layers = 1
70
+ head_dim = 128
71
+ batch_size = 6
72
+ blr = 1.5e-4
73
+ elif vit == "base":
74
+ enc_dim = 768
75
+ dec_dim = 768
76
+ n_heads = 12
77
+ enc_depth = 4
78
+ dec_depth = 12
79
+ head_n_layers = 1
80
+ head_dim = 128
81
+ batch_size = 10
82
+ blr = 1.5e-4
83
+
84
+ lambda_classifier = 0.1
85
+ for skip_conf_points in [False]:
86
+ skip_conf_mask = True
87
+ model = f"RayQuery(ray_enc=RayEncoder(dim={enc_dim},num_heads={n_heads},depth={enc_depth},img_size={render_size},patch_size={patch_size})," + \
88
+ f"pointmap_enc=PointmapEncoder(dim={enc_dim},num_heads={n_heads},depth={enc_depth},img_size={render_size},patch_size={patch_size})," + \
89
+ f"dino_layers={dino_features}," + \
90
+ f"pts_head_type='dpt_depth'," + \
91
+ f"classifier_head_type='dpt_mask'," + \
92
+ f"decoder_dim={dec_dim},decoder_depth={dec_depth},decoder_num_heads={n_heads},imshape={render_size}," + \
93
+ f"criterion=DepthCompletion(ConfLoss(L21,skip_conf={skip_conf_points}),ConfLoss(ClassifierLoss(BCELoss()),skip_conf={skip_conf_mask}),lambda_classifier={lambda_classifier}),return_all_blocks=True)"
94
+
95
+ key = f"conf_points_{not skip_conf_points}"
96
+ key = gen_key(key)
97
+ logdir = os.path.join(root_log_dir,key)
98
+ resume=logdir
99
+ wandb_run_name=key
100
+ os.makedirs(logdir,exist_ok=True)
101
+
102
+ n_epochs = 20
103
+ eval_every = 1
104
+ max_norm = -1
105
+ OMP_NUM_THREADS=16
106
+ warmup_epochs = 1
107
+
108
+ executable = f"OMP_NUM_THREADS={OMP_NUM_THREADS} torchrun --nnodes 1 --nproc_per_node $(python -c 'import torch; print(torch.cuda.device_count())') --master_port $((RANDOM%500+29000)) main.py"
109
+ #executable = f"python main.py"
110
+ if '--just_load' in sys.argv:
111
+ batch_size = 5
112
+ command = f"{executable} --{dataset_train=} --{dataset_test=} --{dataset_just_load=} --{logdir=} --{resume=} --{model=} --{batch_size=} --{normalize_mode=} --{augmentor=}"
113
+ else:
114
+ command = f"{executable} --{dataset_train=} --{dataset_test=} --{logdir=} --{n_epochs=} --{resume=} --{normalize_mode=} --{augmentor=} --{warmup_epochs=}"
115
+ command += f" --{model=} --{eval_every=} --{batch_size=} --{save_every=} --{max_norm=}"
116
+ command += f" --{blr=}"
117
+ if start_from is not None:
118
+ command += f" --{start_from=}"
119
+ if '--no_wandb' not in sys.argv:
120
+ command += f" --wandb_project=3dcomplete " + \
121
+ f"--{wandb_run_name=}"
122
+
123
+ if len(sys.argv) > 1:
124
+ for arg in sys.argv[1:]:
125
+ if '--no_wandb' not in arg:
126
+ command += f" {arg}"
127
+ print(command)
xps/util.py ADDED
@@ -0,0 +1,12 @@
1
+ import inspect
2
+ import os
3
+
4
+ def gen_key(raw_key):
5
+ # concat the raw_key with the file name that this function is called from
6
+ current_frame = inspect.currentframe()
7
+ # Get the caller's frame (the frame that called this function)
8
+ caller_frame = current_frame.f_back
9
+ # Extract the filename from the caller's frame
10
+ caller_file = caller_frame.f_code.co_filename
11
+ caller_file = os.path.basename(caller_file).replace(".py","")
12
+ return f"{caller_file}_{raw_key}"
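gen_key prefixes the raw key with the basename of the calling script. A usage sketch: if the snippet below lived in xps/train_rayst3r.py (as in the experiment file above), it would print train_rayst3r_conf_points_True:

from xps.util import gen_key

key = gen_key("conf_points_True")
print(key)   # "<caller_filename>_conf_points_True", e.g. "train_rayst3r_conf_points_True"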