Paul Engstler committed
Commit 84eee5b · 0 Parent(s):

first commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +35 -0
  2. .gitignore +148 -0
  3. README.md +10 -0
  4. app.py +257 -0
  5. examples/photo-1469559845082-95b66baaf023.jpeg +0 -0
  6. examples/photo-1499916078039-922301b0eb9b.jpeg +0 -0
  7. examples/photo-1514984879728-be0aff75a6e8.jpeg +0 -0
  8. examples/photo-1546975490-e8b92a360b24.jpeg +0 -0
  9. examples/photo-1618197345638-d2df92b39fe1.jpeg +0 -0
  10. examples/photo-1628624747186-a941c476b7ef.jpeg +0 -0
  11. examples/photo-1667788000333-4e36f948de9a.jpeg +0 -0
  12. packages.txt +1 -0
  13. pre-requirements.txt +0 -0
  14. requirements.txt +26 -0
  15. utils/demo.py +54 -0
  16. utils/gaussian_renderer/__init__.py +100 -0
  17. utils/gaussian_renderer/network_gui.py +86 -0
  18. utils/gs.py +196 -0
  19. utils/models.py +119 -0
  20. utils/ops.py +95 -0
  21. utils/render.py +112 -0
  22. utils/scene/__init__.py +92 -0
  23. utils/scene/cameras.py +76 -0
  24. utils/scene/colmap_loader.py +294 -0
  25. utils/scene/dataset_readers.py +270 -0
  26. utils/scene/gaussian_model.py +416 -0
  27. utils/scene/utils/camera_utils.py +84 -0
  28. utils/scene/utils/general_utils.py +133 -0
  29. utils/scene/utils/graphics_utils.py +88 -0
  30. utils/scene/utils/image_utils.py +19 -0
  31. utils/scene/utils/loss_utils.py +65 -0
  32. utils/scene/utils/sh_utils.py +118 -0
  33. utils/scene/utils/system_utils.py +28 -0
  34. zoedepth/LICENSE +21 -0
  35. zoedepth/data/__init__.py +24 -0
  36. zoedepth/data/data_mono.py +697 -0
  37. zoedepth/data/ddad.py +117 -0
  38. zoedepth/data/diml_indoor_test.py +125 -0
  39. zoedepth/data/diml_outdoor_test.py +114 -0
  40. zoedepth/data/diode.py +125 -0
  41. zoedepth/data/hypersim.py +138 -0
  42. zoedepth/data/ibims.py +81 -0
  43. zoedepth/data/marigold_nyu.py +112 -0
  44. zoedepth/data/places365.py +118 -0
  45. zoedepth/data/preprocess.py +154 -0
  46. zoedepth/data/sun_rgbd_loader.py +106 -0
  47. zoedepth/data/transforms.py +481 -0
  48. zoedepth/data/vkitti.py +151 -0
  49. zoedepth/data/vkitti2.py +187 -0
  50. zoedepth/models/__init__.py +24 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,148 @@
+ *.png
+ **.gif
+ .vscode/
+ *.rdb
+ **.xml
+ wandb/
+ slurm/
+ tmp/
+ .logs/
+ checkpoints/
+ external_jobs/
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ ptlflow_logs/
+ output/
+ log/
+ .idea/
+ # C extensions
+ *.so
+ results/
+ **.DS_Store
+ **.pt
+ demo/
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+ ~shortcuts/
+ **/wandb_logs/
+ **.db
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Invisible Stitch
+ emoji: 🪡
+ colorFrom: pink
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.27.0
+ app_file: app.py
+ pinned: false
+ ---
app.py ADDED
@@ -0,0 +1,257 @@
+ import spaces
+ import os
+
+ # this is a HF Spaces specific hack, as
+ # (i) building pytorch3d with GPU support is a bit tricky here
+ # (ii) installing the wheel via requirements.txt breaks ZeroGPU
+ os.system("pip install pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt221/download.html")
+
+ import torch
+ import torch.nn.functional as F
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ import skimage
+ from PIL import Image
+
+ import gradio as gr
+
+ from utils.render import PointsRendererWithMasks, render
+ from utils.ops import snap_high_gradients_to_nn, project_points, get_pointcloud, merge_pointclouds, outpaint_with_depth_estimation
+ from utils.gs import gs_options, read_cameras_from_optimization_bundle, Scene, run_gaussian_splatting, get_blank_gs_bundle
+
+ from pytorch3d.utils import opencv_from_cameras_projection
+ from utils.ops import focal2fov, fov2focal
+ from utils.models import infer_with_zoe_dc
+ from utils.scene import GaussianModel
+ from utils.demo import downsample_point_cloud
+ from typing import Iterable, Tuple, Dict, Optional
+ import itertools
+
+ from pytorch3d.structures import Pointclouds
+ from pytorch3d.renderer import (
+     look_at_view_transform,
+     PerspectiveCameras,
+ )
+
+ from pytorch3d.io import IO
+
+ # note: this duplicates the helper already imported from utils.gs above
+ def get_blank_gs_bundle(h, w):
+     return {
+         "camera_angle_x": focal2fov(torch.tensor([w], dtype=torch.float32), w),
+         "W": w,
+         "H": h,
+         "pcd_points": None,
+         "pcd_colors": None,
+         'frames': [],
+     }
+
+ @spaces.GPU(duration=30)
+ def extrapolate_point_cloud(prompt: str, image_size: Tuple[int, int], look_at_params: Iterable[Tuple[float, float, float, Tuple[float, float, float]]], point_cloud: Pointclouds = None, dry_run: bool = False, discard_mask: bool = False, initial_image: Optional[Image.Image] = None, depth_scaling: float = 1, **render_kwargs):
+     w, h = image_size
+     optimization_bundle_frames = []
+
+     for azim, elev, dist, at in look_at_params:
+         R, T = look_at_view_transform(device=device, azim=azim, elev=elev, dist=dist, at=at)
+         cameras = PerspectiveCameras(R=R, T=T, focal_length=torch.tensor([w], dtype=torch.float32), principal_point=(((h-1)/2, (w-1)/2),), image_size=(image_size,), device=device, in_ndc=False)
+
+         if point_cloud is not None:
+             images, masks, depths = render(cameras, point_cloud, **render_kwargs)
+
+             if not dry_run:
+                 eroded_mask = skimage.morphology.binary_erosion((depths[0] > 0).cpu().numpy(), footprint=None)  # a disk footprint (e.g. skimage.morphology.disk(1)) would erode more aggressively
+                 eroded_depth = depths[0].clone()
+                 eroded_depth[torch.from_numpy(eroded_mask).to(depths.device) <= 0] = 0
+
+                 outpainted_img, aligned_depth = outpaint_with_depth_estimation(images[0], masks[0], eroded_depth, h, w, pipe, zoe_dc_model, prompt, cameras, dilation_size=2, depth_scaling=depth_scaling, generator=torch.Generator(device=pipe.device).manual_seed(0))
+
+                 aligned_depth = torch.from_numpy(aligned_depth).to(device)
+
+             else:
+                 # in a dry run, we do not actually outpaint the image
+                 outpainted_img = Image.fromarray((255*images[0].cpu().numpy()).astype(np.uint8))
+
+         else:
+             assert initial_image is not None
+             assert not dry_run
+
+             # jumpstart the point cloud with a regular depth estimation
+             t_initial_image = torch.from_numpy(np.asarray(initial_image)/255.).permute(2,0,1).float()
+             depth = aligned_depth = infer_with_zoe_dc(zoe_dc_model, t_initial_image, torch.zeros(h, w))
+             outpainted_img = initial_image
+             images = [t_initial_image.to(device)]
+             masks = [torch.ones(h, w, dtype=torch.bool).to(device)]
+
+         if not dry_run:
+             # snap high gradients to their nearest neighbor, which eliminates "noodle" artifacts at depth discontinuities
+             aligned_depth = snap_high_gradients_to_nn(aligned_depth, threshold=12).cpu()
+             xy_depth_world = project_points(cameras, aligned_depth)
+
+         c2w = cameras.get_world_to_view_transform().get_matrix()[0]
+
+         optimization_bundle_frames.append({
+             "image": outpainted_img,
+             "mask": masks[0].cpu().numpy(),
+             "transform_matrix": c2w.tolist(),
+             "azim": azim,
+             "elev": elev,
+             "dist": dist,
+         })
+
+         if discard_mask:
+             optimization_bundle_frames[-1].pop("mask")
+
+         if not dry_run:
+             optimization_bundle_frames[-1]["center_point"] = xy_depth_world[0].mean(dim=0).tolist()
+             optimization_bundle_frames[-1]["depth"] = aligned_depth.cpu().numpy()
+             optimization_bundle_frames[-1]["mean_depth"] = aligned_depth.mean().item()
+
+         else:
+             # in a dry run, we do not modify the point cloud
+             continue
+
+         rgb = (torch.from_numpy(np.asarray(outpainted_img).copy()).reshape(-1, 3).float() / 255).to(device)
+
+         if point_cloud is None:
+             point_cloud = get_pointcloud(xy_depth_world[0], device=device, features=rgb)
+
+         else:
+             # pytorch3d's mask might be slightly too big (subpixels), so we erode it a little to avoid seams;
+             # in theory, 1 pixel is sufficient, but we use 2 to be safe
+             masks[0] = torch.from_numpy(skimage.morphology.binary_erosion(masks[0].cpu().numpy(), footprint=skimage.morphology.disk(2))).to(device)
+
+             partial_outpainted_point_cloud = get_pointcloud(xy_depth_world[0][~masks[0].view(-1)], device=device, features=rgb[~masks[0].view(-1)])
+
+             point_cloud = merge_pointclouds([point_cloud, partial_outpainted_point_cloud])
+
+     return optimization_bundle_frames, point_cloud
+
+ @spaces.GPU(duration=30)
+ def generate_point_cloud(initial_image: Image.Image, prompt: str):
+     image_size = initial_image.size
+     w, h = image_size
+
+     optimization_bundle = get_blank_gs_bundle(h, w)
+
+     step_size = 25
+
+     azim_steps = [0, step_size, -step_size]
+     look_at_params = [(azim, 0, 0.01, torch.zeros((1, 3))) for azim in azim_steps]
+
+     optimization_bundle["frames"], point_cloud = extrapolate_point_cloud(prompt, image_size, look_at_params, discard_mask=True, initial_image=initial_image, depth_scaling=0.5, fill_point_cloud_holes=True)
+
+     optimization_bundle["pcd_points"] = point_cloud.points_padded()[0].cpu().numpy()
+     optimization_bundle["pcd_colors"] = point_cloud.features_padded()[0].cpu().numpy()
+
+     return optimization_bundle, point_cloud
+
+ @spaces.GPU(duration=30)
+ def supplement_point_cloud(optimization_bundle: Dict, point_cloud: Pointclouds, prompt: str):
+     w, h = optimization_bundle["W"], optimization_bundle["H"]
+
+     supporting_frames = []
+
+     for i, frame in enumerate(optimization_bundle["frames"]):
+         # skip supporting views
+         if frame.get("supporting", False):
+             continue
+
+         center_point = torch.tensor(frame["center_point"]).to(device)
+         mean_depth = frame["mean_depth"]
+         azim, elev = frame["azim"], frame["elev"]
+
+         azim_jitters = torch.linspace(-5, 5, 3).tolist()
+         elev_jitters = torch.linspace(-5, 5, 3).tolist()
+
+         # build the product of azim and elev jitters
+         camera_jitters = [{"azim": azim + azim_jitter, "elev": elev + elev_jitter} for azim_jitter, elev_jitter in itertools.product(azim_jitters, elev_jitters)]
+
+         look_at_params = [(camera_jitter["azim"], camera_jitter["elev"], mean_depth, center_point.unsqueeze(0)) for camera_jitter in camera_jitters]
+
+         local_supporting_frames, point_cloud = extrapolate_point_cloud(prompt, (w, h), look_at_params, point_cloud, dry_run=True, depth_scaling=0.5, antialiasing=3)
+
+         for local_supporting_frame in local_supporting_frames:
+             local_supporting_frame["supporting"] = True
+
+         supporting_frames.extend(local_supporting_frames)
+
+     optimization_bundle["pcd_points"] = point_cloud.points_padded()[0].cpu().numpy()
+     optimization_bundle["pcd_colors"] = point_cloud.features_padded()[0].cpu().numpy()
+
+     return optimization_bundle, point_cloud
+
+ @spaces.GPU(duration=30)
+ def generate_scene(img: Image.Image, prompt: str):
+     assert isinstance(img, Image.Image)
+
+     # resize image, maintaining the aspect ratio, so the longest side is 720 pixels
+     max_size = 720
+     img.thumbnail((max_size, max_size))
+
+     # crop to ensure the image dimensions are divisible by 8
+     img = img.crop((0, 0, img.width - img.width % 8, img.height - img.height % 8))
+
+     from hashlib import sha1
+     from datetime import datetime
+
+     run_id = sha1(datetime.now().isoformat().encode()).hexdigest()[:6]
+
+     run_name = f"gradio_{run_id}"
+
+     gs_optimization_bundle, point_cloud = generate_point_cloud(img, prompt)
+
+     downsampled_point_cloud = downsample_point_cloud(gs_optimization_bundle, device=device)
+
+     gs_optimization_bundle["pcd_points"] = downsampled_point_cloud.points_padded()[0].cpu().numpy()
+     gs_optimization_bundle["pcd_colors"] = downsampled_point_cloud.features_padded()[0].cpu().numpy()
+
+     scene = Scene(gs_optimization_bundle, GaussianModel(gs_options.sh_degree), gs_options)
+
+     scene.gaussians._opacity = torch.ones_like(scene.gaussians._opacity)
+     # scene = run_gaussian_splatting(scene, gs_optimization_bundle)
+
+     # coordinate system transformation
+     scene.gaussians._xyz = scene.gaussians._xyz.detach()
+     scene.gaussians._xyz[:, 1] = -scene.gaussians._xyz[:, 1]
+     scene.gaussians._xyz[:, 2] = -scene.gaussians._xyz[:, 2]
+
+     save_path = os.path.join("outputs", f"{run_name}.ply")
+
+     scene.gaussians.save_ply(save_path)
+
+     return save_path
+
+ if __name__ == "__main__":
+     global device
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     from utils.models import get_zoe_dc_model, get_sd_pipeline
+
+     global zoe_dc_model
+     from huggingface_hub import hf_hub_download
+     zoe_dc_model = get_zoe_dc_model(ckpt_path=hf_hub_download(repo_id="paulengstler/invisible-stitch", filename="invisible-stitch.pt")).to(device)
+
+     global pipe
+     pipe = get_sd_pipeline().to(device)
+
+     demo = gr.Interface(
+         fn=generate_scene,
+         inputs=[
+             gr.Image(label="Input Image", sources=["upload", "clipboard"], type="pil"),
+             gr.Textbox(label="Scene Hallucination Prompt")
+         ],
+         outputs=gr.Model3D(label="Generated Scene"),
+         allow_flagging="never",
+         title="Invisible Stitch: Generating Smooth 3D Scenes with Depth Inpainting",
+         description="Hallucinate geometrically coherent 3D scenes from a single input image in less than 30 seconds.<br /> [Project Page](https://research.paulengstler.com/invisible-stitch) | [GitHub](https://github.com/paulengstler/invisible-stitch) | [Paper](#) <br /><br />To keep this demo snappy, we have limited its functionality. Scenes are generated at a low resolution without densification, supporting views are not inpainted, and we do not optimize the resulting point cloud. Imperfections are to be expected, in particular around object borders. Please allow a couple of seconds for the generated scene to be downloaded (about 40 megabytes).",
+         article="Please consider running this demo locally to obtain high-quality results (see the GitHub repository).<br /><br />Here are some observations we made that might help you to get better results:<ul><li>Use generic prompts that match the surroundings of your input image.</li><li>Ensure that the borders of your input image are free from partially visible objects.</li><li>Keep your prompts simple and avoid adding specific details.</li></ul>",
+         examples=[
+             ["examples/photo-1667788000333-4e36f948de9a.jpeg", "a street with traditional buildings in Kyoto, Japan"],
+             ["examples/photo-1628624747186-a941c476b7ef.jpeg", "a suburban street in North Carolina on a bright, sunny day"],
+             ["examples/photo-1469559845082-95b66baaf023.jpeg", "a view of Zion National Park"],
+             ["examples/photo-1514984879728-be0aff75a6e8.jpeg", "a close-up view of a muddy path in a forest"],
+             ["examples/photo-1618197345638-d2df92b39fe1.jpeg", "a close-up view of a white linen bed in a minimalistic room"],
+             ["examples/photo-1546975490-e8b92a360b24.jpeg", "a warm living room with plants"],
+             ["examples/photo-1499916078039-922301b0eb9b.jpeg", "a cozy bedroom on a bright day"],
+         ])
+     demo.queue().launch(share=True)
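
The Gradio callback above can also be exercised programmatically. A minimal local-usage sketch, assuming the model setup in the `__main__` block has already run (so `device`, `zoe_dc_model`, and `pipe` exist), that an `outputs/` directory is present, and that the bundled example image is available:

from PIL import Image

image = Image.open("examples/photo-1546975490-e8b92a360b24.jpeg")
ply_path = generate_scene(image, "a warm living room with plants")
print(f"Gaussian splat scene written to {ply_path}")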
examples/photo-1469559845082-95b66baaf023.jpeg ADDED
examples/photo-1499916078039-922301b0eb9b.jpeg ADDED
examples/photo-1514984879728-be0aff75a6e8.jpeg ADDED
examples/photo-1546975490-e8b92a360b24.jpeg ADDED
examples/photo-1618197345638-d2df92b39fe1.jpeg ADDED
examples/photo-1628624747186-a941c476b7ef.jpeg ADDED
examples/photo-1667788000333-4e36f948de9a.jpeg ADDED
packages.txt ADDED
@@ -0,0 +1 @@
+ python3-dev
pre-requirements.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,26 @@
+ datasets==2.19.0
+ diffusers==0.26.3
+ fire==0.5.0
+ gradio==4.27.0
+ h5py==3.10.0
+ huggingface_hub==0.22.2
+ imageio==2.33.1
+ jaxtyping==0.2.28
+ matplotlib==3.7.5
+ numpy==1.22.4
+ opencv_python==4.8.0.76
+ pandas==1.5.1
+ Pillow==10.3.0
+ plyfile==1.0.3
+ scipy==1.8.1
+ scikit-image
+ submitit==1.5.1
+ tqdm==4.66.1
+ trimesh==3.21.7
+ wandb==0.16.3
+ xformers==0.0.25
+ spaces
+ timm==0.6.7
+ transformers==4.37.2
+ accelerate==0.27.2
+ easydict
utils/demo.py ADDED
@@ -0,0 +1,54 @@
+ import copy
+ import torch
+ import numpy as np
+
+ import skimage
+ from pytorch3d.renderer import (
+     look_at_view_transform,
+     PerspectiveCameras,
+ )
+
+ from .render import render
+ from .ops import project_points, get_pointcloud, merge_pointclouds
+
+ def downsample_point_cloud(optimization_bundle, device="cpu"):
+     point_cloud = None
+
+     for i, frame in enumerate(optimization_bundle["frames"]):
+         if frame.get("supporting", False):
+             continue
+
+         downsampled_image = copy.deepcopy(frame["image"])
+         downsampled_image.thumbnail((360, 360))
+
+         image_size = downsampled_image.size
+         w, h = image_size
+
+         # regenerate the point cloud at a lower resolution
+         R, T = look_at_view_transform(device=device, azim=frame["azim"], elev=frame["elev"], dist=frame["dist"])
+         cameras = PerspectiveCameras(R=R, T=T, focal_length=torch.tensor([w], dtype=torch.float32), principal_point=(((h-1)/2, (w-1)/2),), image_size=(image_size,), device=device, in_ndc=False)
+
+         # downsample the depth
+         downsampled_depth = torch.nn.functional.interpolate(torch.tensor(frame["depth"]).unsqueeze(0).unsqueeze(0).float().to(device), size=(h, w), mode="nearest").squeeze()
+
+         xy_depth_world = project_points(cameras, downsampled_depth)
+
+         rgb = (torch.from_numpy(np.asarray(downsampled_image).copy()).reshape(-1, 3).float() / 255).to(device)
+
+         c2w = cameras.get_world_to_view_transform().get_matrix()[0]
+
+         if i == 0:
+             point_cloud = get_pointcloud(xy_depth_world[0], device=device, features=rgb)
+
+         else:
+             images, masks, depths = render(cameras, point_cloud, radius=1e-2)
+
+             # pytorch3d's mask might be slightly too big (subpixels), so we erode it by one pixel to avoid seams
+             masks[0] = torch.from_numpy(skimage.morphology.binary_erosion(masks[0].cpu().numpy(), footprint=skimage.morphology.disk(1))).to(device)
+
+             partial_outpainted_point_cloud = get_pointcloud(xy_depth_world[0][~masks[0].view(-1)], device=device, features=rgb[~masks[0].view(-1)])
+
+             point_cloud = merge_pointclouds([point_cloud, partial_outpainted_point_cloud])
+
+     return point_cloud
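
A hypothetical usage sketch: shrinking the reconstruction before Gaussian splatting, as `app.generate_scene` does. `optimization_bundle` is assumed to come from `generate_point_cloud`, so every non-supporting frame carries "image", "depth", "azim", "elev", and "dist" entries:

from utils.demo import downsample_point_cloud

small_pc = downsample_point_cloud(optimization_bundle, device="cuda")
print(small_pc.points_padded().shape)  # (1, N, 3), regenerated at thumbnail resolution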
utils/gaussian_renderer/__init__.py ADDED
@@ -0,0 +1,100 @@
+ #
+ # Copyright (C) 2023, Inria
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
+ # All rights reserved.
+ #
+ # This software is free for non-commercial, research and evaluation use
+ # under the terms of the LICENSE.md file.
+ #
+ # For inquiries contact [email protected]
+ #
+
+ import torch
+ import math
+ from diff_gaussian_rasterization import GaussianRasterizationSettings, GaussianRasterizer
+ # relative imports within the utils package
+ from ..scene.gaussian_model import GaussianModel
+ from ..scene.utils.sh_utils import eval_sh
+
+ def render(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, scaling_modifier = 1.0, override_color = None):
+     """
+     Render the scene.
+
+     Background tensor (bg_color) must be on GPU!
+     """
+
+     # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
+     screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
+     try:
+         screenspace_points.retain_grad()
+     except Exception:
+         pass
+
+     # Set up rasterization configuration
+     tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
+     tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)
+
+     raster_settings = GaussianRasterizationSettings(
+         image_height=int(viewpoint_camera.image_height),
+         image_width=int(viewpoint_camera.image_width),
+         tanfovx=tanfovx,
+         tanfovy=tanfovy,
+         bg=bg_color,
+         scale_modifier=scaling_modifier,
+         viewmatrix=viewpoint_camera.world_view_transform,
+         projmatrix=viewpoint_camera.full_proj_transform,
+         sh_degree=pc.active_sh_degree,
+         campos=viewpoint_camera.camera_center,
+         prefiltered=False,
+         debug=pipe.debug
+     )
+
+     rasterizer = GaussianRasterizer(raster_settings=raster_settings)
+
+     means3D = pc.get_xyz
+     means2D = screenspace_points
+     opacity = pc.get_opacity
+
+     # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
+     # scaling / rotation by the rasterizer.
+     scales = None
+     rotations = None
+     cov3D_precomp = None
+     if pipe.compute_cov3D_python:
+         cov3D_precomp = pc.get_covariance(scaling_modifier)
+     else:
+         scales = pc.get_scaling
+         rotations = pc.get_rotation
+
+     # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
+     # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by the rasterizer.
+     shs = None
+     colors_precomp = None
+     if override_color is None:
+         if pipe.convert_SHs_python:
+             shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
+             dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.repeat(pc.get_features.shape[0], 1))
+             dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
+             sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
+             colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
+         else:
+             shs = pc.get_features
+     else:
+         colors_precomp = override_color
+
+     # Rasterize visible Gaussians to image, obtain their radii (on screen).
+     rendered_image, radii = rasterizer(
+         means3D = means3D,
+         means2D = means2D,
+         shs = shs,
+         colors_precomp = colors_precomp,
+         opacities = opacity,
+         scales = scales,
+         rotations = rotations,
+         cov3D_precomp = cov3D_precomp)
+
+     # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
+     # They will be excluded from value updates used in the splitting criteria.
+     return {"render": rendered_image,
+             "viewspace_points": screenspace_points,
+             "visibility_filter" : radii > 0,
+             "radii": radii}
utils/gaussian_renderer/network_gui.py ADDED
@@ -0,0 +1,86 @@
+ #
+ # Copyright (C) 2023, Inria
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
+ # All rights reserved.
+ #
+ # This software is free for non-commercial, research and evaluation use
+ # under the terms of the LICENSE.md file.
+ #
+ # For inquiries contact [email protected]
+ #
+
+ import torch
+ import traceback
+ import socket
+ import json
+ from ..scene.cameras import MiniCam
+
+ host = "127.0.0.1"
+ port = 6009
+
+ conn = None
+ addr = None
+
+ listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+
+ def init(wish_host, wish_port):
+     global host, port, listener
+     host = wish_host
+     port = wish_port
+     listener.bind((host, port))
+     listener.listen()
+     listener.settimeout(0)
+
+ def try_connect():
+     global conn, addr, listener
+     try:
+         conn, addr = listener.accept()
+         print(f"\nConnected by {addr}")
+         conn.settimeout(None)
+     except Exception as inst:
+         pass
+
+ def read():
+     global conn
+     messageLength = conn.recv(4)
+     messageLength = int.from_bytes(messageLength, 'little')
+     message = conn.recv(messageLength)
+     return json.loads(message.decode("utf-8"))
+
+ def send(message_bytes, verify):
+     global conn
+     if message_bytes is not None:
+         conn.sendall(message_bytes)
+     conn.sendall(len(verify).to_bytes(4, 'little'))
+     conn.sendall(bytes(verify, 'ascii'))
+
+ def receive():
+     message = read()
+
+     width = message["resolution_x"]
+     height = message["resolution_y"]
+
+     if width != 0 and height != 0:
+         try:
+             do_training = bool(message["train"])
+             fovy = message["fov_y"]
+             fovx = message["fov_x"]
+             znear = message["z_near"]
+             zfar = message["z_far"]
+             do_shs_python = bool(message["shs_python"])
+             do_rot_scale_python = bool(message["rot_scale_python"])
+             keep_alive = bool(message["keep_alive"])
+             scaling_modifier = message["scaling_modifier"]
+             world_view_transform = torch.reshape(torch.tensor(message["view_matrix"]), (4, 4)).cuda()
+             world_view_transform[:,1] = -world_view_transform[:,1]
+             world_view_transform[:,2] = -world_view_transform[:,2]
+             full_proj_transform = torch.reshape(torch.tensor(message["view_projection_matrix"]), (4, 4)).cuda()
+             full_proj_transform[:,1] = -full_proj_transform[:,1]
+             custom_cam = MiniCam(width, height, fovy, fovx, znear, zfar, world_view_transform, full_proj_transform)
+         except Exception as e:
+             print("")
+             traceback.print_exc()
+             raise e
+         return custom_cam, do_training, do_shs_python, do_rot_scale_python, keep_alive, scaling_modifier
+     else:
+         return None, None, None, None, None, None
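
The viewer protocol implied by read() and send() is a 4-byte little-endian length prefix followed by a UTF-8 JSON payload. A hedged client sketch; the field names mirror those unpacked in receive(), and a 0x0 resolution keeps the server idle:

import json
import socket

msg = json.dumps({"resolution_x": 0, "resolution_y": 0}).encode("utf-8")

with socket.create_connection(("127.0.0.1", 6009)) as sock:
    sock.sendall(len(msg).to_bytes(4, "little"))
    sock.sendall(msg)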
utils/gs.py ADDED
@@ -0,0 +1,196 @@
+ import random
+ import torch
+ import numpy as np
+ from .scene import GaussianModel
+ from .scene.dataset_readers import SceneInfo, getNerfppNorm
+ from .scene.cameras import Camera
+ from .ops import focal2fov, fov2focal
+ from .scene.gaussian_model import BasicPointCloud
+ from easydict import EasyDict as edict
+ from PIL import Image
+
+ from tqdm.auto import tqdm
+
+ def get_blank_gs_bundle(h, w):
+     return {
+         "camera_angle_x": focal2fov(torch.tensor([w], dtype=torch.float32), w),
+         "W": w,
+         "H": h,
+         "pcd_points": None,
+         "pcd_colors": None,
+         'frames': [],
+     }
+
+ def read_cameras_from_optimization_bundle(optimization_bundle, white_background: bool = False):
+     cameras = []
+
+     fovx = optimization_bundle["camera_angle_x"]
+     frames = optimization_bundle["frames"]
+
+     # we flip the x and y axis to move from PyTorch3D's coordinate system to COLMAP's
+     coordinate_system_transform = np.array([-1, -1, 1])
+
+     for idx, frame in enumerate(frames):
+         c2w = np.array(frame["transform_matrix"])
+         c2w[:3, :3] = c2w[:3, :3] * coordinate_system_transform
+
+         # get the world-to-camera transform and set R, T
+         w2c = np.linalg.inv(c2w)
+         R = np.transpose(w2c[:3, :3])  # R is stored transposed due to 'glm' in CUDA code
+         T = c2w[-1, :3] * coordinate_system_transform
+
+         image = frame["image"]
+
+         im_data = np.array(image.convert("RGBA"))
+
+         bg = np.array([1, 1, 1]) if white_background else np.array([0, 0, 0])
+
+         norm_data = im_data / 255.0
+         arr = norm_data[:, :, :3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4])
+         image = Image.fromarray(np.array(arr*255.0, dtype=np.byte), "RGB")
+
+         fovy = focal2fov(fov2focal(fovx, image.size[0]), image.size[1])
+         FovY = fovy
+         FovX = fovx
+
+         image = torch.Tensor(arr).permute(2, 0, 1)
+
+         cameras.append(Camera(colmap_id=idx, R=R, T=T, FoVx=FovX, FoVy=FovY, image=image, mask=frame.get("mask", None),
+                               gt_alpha_mask=None, image_name='', uid=idx, data_device='cuda'))
+
+     return cameras
+
+ class Scene:
+     gaussians: GaussianModel
+
+     def __init__(self, traindata, gaussians: GaussianModel, gs_options, shuffle: bool = True):
+         self.traindata = traindata
+         self.gaussians = gaussians
+
+         train_cameras = read_cameras_from_optimization_bundle(traindata, gs_options.white_background)
+
+         nerf_normalization = getNerfppNorm(train_cameras)
+
+         pcd = BasicPointCloud(points=traindata['pcd_points'], colors=traindata['pcd_colors'], normals=None)
+
+         scene_info = SceneInfo(point_cloud=pcd,
+                                train_cameras=train_cameras,
+                                test_cameras=[],
+                                nerf_normalization=nerf_normalization,
+                                ply_path='')
+
+         if shuffle:
+             random.shuffle(scene_info.train_cameras)  # Multi-res consistent random shuffling
+
+         self.cameras_extent = scene_info.nerf_normalization["radius"]
+
+         self.train_cameras = scene_info.train_cameras
+
+         bg_color = np.array([1, 1, 1]) if gs_options.white_background else np.array([0, 0, 0])
+         self.background = torch.tensor(bg_color, dtype=torch.float32, device='cuda')
+
+         self.gaussians.create_from_pcd(scene_info.point_cloud, self.cameras_extent)
+         self.gaussians.training_setup(gs_options)
+
+     def getTrainCameras(self):
+         return self.train_cameras
+
+     def getPresetCameras(self, preset):
+         assert preset in self.preset_cameras
+         return self.preset_cameras[preset]
+
+ def run_gaussian_splatting(scene, gs_optimization_bundle):
+     torch.cuda.empty_cache()
+
+     # the demo returns here without optimizing (gs_options.iterations is 0, so the
+     # loop below would not run anyway); remove this early return for local runs
+     return scene
+
+     from random import randint
+     from .gaussian_renderer import render as gs_render
+     from .scene.utils.loss_utils import l1_loss, ssim
+
+     pbar = tqdm(range(1, gs_options.iterations + 1))
+     for iteration in pbar:
+         scene.gaussians.update_learning_rate(iteration)
+
+         # Every 1000 iterations we increase the levels of SH up to a maximum degree
+         if iteration % 1000 == 0:
+             scene.gaussians.oneupSHdegree()
+
+         # Pick a random camera
+         random_idx = randint(0, len(gs_optimization_bundle["frames"])-1)
+         viewpoint_cam = scene.getTrainCameras()[random_idx]
+
+         # Render
+         render_pkg = gs_render(viewpoint_cam, scene.gaussians, gs_options, scene.background)
+         image, viewspace_point_tensor, visibility_filter, radii = (
+             render_pkg['render'], render_pkg['viewspace_points'], render_pkg['visibility_filter'], render_pkg['radii'])
+
+         # Loss
+         gt_image = viewpoint_cam.original_image.cuda()
+         Ll1 = l1_loss(image, gt_image, reduce=False)
+         loss = (1.0 - gs_options.lambda_dssim) * Ll1
+
+         if viewpoint_cam.mask is not None:
+             mask = torch.from_numpy(viewpoint_cam.mask).to(loss.device)
+         else:
+             mask = 1
+
+         loss = (loss * mask).mean()
+         loss = loss + gs_options.lambda_dssim * (1.0 - ssim(image, gt_image))
+         loss.backward()
+
+         pbar.set_description(f"Loss: {loss.item():.4f}")
+
+         with torch.no_grad():
+             # Densification
+             if iteration < gs_options.densify_until_iter:
+                 # Keep track of max radii in image-space for pruning
+                 scene.gaussians.max_radii2D[visibility_filter] = torch.max(
+                     scene.gaussians.max_radii2D[visibility_filter], radii[visibility_filter])
+                 scene.gaussians.add_densification_stats(viewspace_point_tensor, visibility_filter)
+
+                 if iteration > gs_options.densify_from_iter and iteration % gs_options.densification_interval == 0:
+                     size_threshold = 20 if iteration > gs_options.opacity_reset_interval else None
+                     scene.gaussians.densify_and_prune(
+                         gs_options.densify_grad_threshold, 0.005, scene.cameras_extent, size_threshold)
+
+                 if (iteration % gs_options.opacity_reset_interval == 0
+                         or (gs_options.white_background and iteration == gs_options.densify_from_iter)):
+                     scene.gaussians.reset_opacity()
+
+             # Optimizer step
+             if iteration < gs_options.iterations:
+                 scene.gaussians.optimizer.step()
+                 scene.gaussians.optimizer.zero_grad(set_to_none=True)
+
+     return scene
+
+ gs_options = edict({
+     "sh_degree": 3,
+     "images": "images",
+     "resolution": -1,
+     "white_background": False,
+     "data_device": "cuda",
+     "eval": False,
+     "use_depth": False,
+     "iterations": 0,  # 250 when the optimization loop is enabled
+     "position_lr_init": 0.00016,
+     "position_lr_final": 0.0000016,
+     "position_lr_delay_mult": 0.01,
+     "position_lr_max_steps": 2990,
+     "feature_lr": 0.0,  # 0.0025 when the optimization loop is enabled
+     "opacity_lr": 0.0,  # 0.05
+     "scaling_lr": 0.0,  # 0.005
+     "rotation_lr": 0.0,  # 0.001
+     "percent_dense": 0.01,
+     "lambda_dssim": 0.2,
+     "densification_interval": 100,
+     "opacity_reset_interval": 3000,
+     "densify_from_iter": 10_000,
+     "densify_until_iter": 15_000,
+     "densify_grad_threshold": 0.0002,
+     "convert_SHs_python": False,
+     "compute_cov3D_python": False,
+     "debug": False,
+ })
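
A hedged sketch of the intended flow, mirroring app.generate_scene: `bundle` is assumed to be an optimization bundle whose pcd_points/pcd_colors have been filled in, and the target directory is assumed to exist. With iterations set to 0, the optimization step is a no-op:

from utils.gs import Scene, gs_options, run_gaussian_splatting
from utils.scene import GaussianModel

scene = Scene(bundle, GaussianModel(gs_options.sh_degree), gs_options)
scene = run_gaussian_splatting(scene, bundle)  # returns immediately in the demo configuration
scene.gaussians.save_ply("outputs/scene.ply")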
utils/models.py ADDED
@@ -0,0 +1,119 @@
+ import glob
+ import os
+
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+
+ from zoedepth.utils.misc import colorize
+ from zoedepth.utils.config import get_config
+ from zoedepth.models.builder import build_model
+ from zoedepth.models.model_io import load_wts
+
+ from diffusers import AsymmetricAutoencoderKL, StableDiffusionInpaintPipeline
+
+ def load_ckpt(config, model, checkpoint_dir: str = "./checkpoints", ckpt_type: str = "best"):
+     if hasattr(config, "checkpoint"):
+         checkpoint = config.checkpoint
+     elif hasattr(config, "ckpt_pattern"):
+         pattern = config.ckpt_pattern
+         matches = glob.glob(os.path.join(
+             checkpoint_dir, f"*{pattern}*{ckpt_type}*"))
+         if not matches:
+             raise ValueError(f"No matches found for the pattern {pattern}")
+
+         checkpoint = matches[0]
+
+     else:
+         return model
+     model = load_wts(model, checkpoint)
+     print("Loaded weights from {0}".format(checkpoint))
+     return model
+
+ def get_zoe_dc_model(vanilla: bool = False, ckpt_path: str = None, **kwargs):
+     def ZoeD_N(midas_model_type="DPT_BEiT_L_384", vanilla=False, **kwargs):
+         if midas_model_type != "DPT_BEiT_L_384":
+             raise ValueError(f"Only DPT_BEiT_L_384 MiDaS model is supported for pretrained Zoe_N model, got: {midas_model_type}")
+
+         zoedepth_config = get_config("zoedepth", "train", **kwargs)
+         model = build_model(zoedepth_config)
+
+         if vanilla:
+             model.__setattr__("vanilla", True)
+             return model
+         else:
+             model.__setattr__("vanilla", False)
+
+         if zoedepth_config.add_depth_channel and not vanilla:
+             # widen the first patch-embedding convolution by two channels
+             # (sparse depth and its validity mask)
+             model.core.core.pretrained.model.patch_embed.proj = torch.nn.Conv2d(
+                 model.core.core.pretrained.model.patch_embed.proj.in_channels+2,
+                 model.core.core.pretrained.model.patch_embed.proj.out_channels,
+                 kernel_size=model.core.core.pretrained.model.patch_embed.proj.kernel_size,
+                 stride=model.core.core.pretrained.model.patch_embed.proj.stride,
+                 padding=model.core.core.pretrained.model.patch_embed.proj.padding,
+                 bias=True)
+
+         if ckpt_path is not None:
+             assert os.path.exists(ckpt_path)
+             zoedepth_config.__setattr__("checkpoint", ckpt_path)
+         else:
+             assert vanilla, "ckpt_path must be provided for non-vanilla model"
+
+         model = load_ckpt(zoedepth_config, model)
+
+         return model
+
+     return ZoeD_N(vanilla=vanilla, ckpt_path=ckpt_path, **kwargs)
+
+ def infer_with_pad(zoe, x, pad_input: bool = True, fh: float = 3, fw: float = 3, upsampling_mode: str = "bicubic", padding_mode: str = "reflect", **kwargs):
+     assert x.dim() == 4, "x must be 4 dimensional, got {}".format(x.dim())
+
+     if pad_input:
+         assert fh > 0 or fw > 0, "at least one of fh and fw must be greater than 0"
+         pad_h = int(np.sqrt(x.shape[2]/2) * fh)
+         pad_w = int(np.sqrt(x.shape[3]/2) * fw)
+         padding = [pad_w, pad_w]
+         if pad_h > 0:
+             padding += [pad_h, pad_h]
+
+         x_rgb = x[:, :3]
+         x_remaining = x[:, 3:]
+         x_rgb = F.pad(x_rgb, padding, mode=padding_mode, **kwargs)
+         x_remaining = F.pad(x_remaining, padding, mode="constant", value=0, **kwargs)
+         x = torch.cat([x_rgb, x_remaining], dim=1)
+     out = zoe(x)["metric_depth"]
+     if out.shape[-2:] != x.shape[-2:]:
+         out = F.interpolate(out, size=(x.shape[2], x.shape[3]), mode=upsampling_mode, align_corners=False)
+     if pad_input:
+         # crop to the original size, handling the case where pad_h or pad_w is 0
+         if pad_h > 0:
+             out = out[:, :, pad_h:-pad_h, :]
+         if pad_w > 0:
+             out = out[:, :, :, pad_w:-pad_w]
+     return out
+
+ @torch.no_grad()
+ def infer_with_zoe_dc(zoe_dc, image, sparse_depth, scaling: float = 1):
+     sparse_depth_mask = (sparse_depth[None, None, ...] > 0).float()
+     # the metric depth range defined during training is [1e-3, 10]
+     x = torch.cat([image[None, ...], sparse_depth[None, None, ...] / (float(scaling) * 10.0), sparse_depth_mask], dim=1).to(zoe_dc.device)
+
+     out = infer_with_pad(zoe_dc, x)
+     out_flip = infer_with_pad(zoe_dc, torch.flip(x, dims=[3]))
+     out = (out + torch.flip(out_flip, dims=[3])) / 2
+
+     pred_depth = float(scaling) * out
+
+     return torch.nn.functional.interpolate(pred_depth, image.shape[-2:], mode='bilinear', align_corners=True)[0, 0]
+
+ def get_sd_pipeline():
+     pipe = StableDiffusionInpaintPipeline.from_pretrained(
+         "stabilityai/stable-diffusion-2-inpainting",
+         torch_dtype=torch.float16,
+     )
+     pipe.vae = AsymmetricAutoencoderKL.from_pretrained(
+         "cross-attention/asymmetric-autoencoder-kl-x-2",
+         torch_dtype=torch.float16
+     )
+
+     return pipe
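
A hedged usage sketch for the depth-completion model: a plain depth estimate is the special case where the sparse-depth channel is all zeros. Assumes a CUDA device and mirrors the checkpoint download in app.py:

import numpy as np
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from utils.models import get_zoe_dc_model, infer_with_zoe_dc

ckpt = hf_hub_download(repo_id="paulengstler/invisible-stitch", filename="invisible-stitch.pt")
model = get_zoe_dc_model(ckpt_path=ckpt).to("cuda")

img = Image.open("examples/photo-1546975490-e8b92a360b24.jpeg")
x = torch.from_numpy(np.asarray(img) / 255.).permute(2, 0, 1).float()
depth = infer_with_zoe_dc(model, x, torch.zeros(img.height, img.width))
print(depth.shape)  # (H, W) metric depth at the input resolution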
utils/ops.py ADDED
@@ -0,0 +1,95 @@
+ import numpy as np
+ import torch
+ import skimage
+ from scipy import ndimage
+ from PIL import Image
+ from .models import infer_with_zoe_dc
+ from pytorch3d.structures import Pointclouds
+
+ import math
+
+ def nearest_neighbor_fill(img, mask, erosion=0):
+     img_ = np.copy(img.cpu().numpy())
+
+     if erosion > 0:
+         eroded_mask = skimage.morphology.binary_erosion(mask.cpu().numpy(), footprint=skimage.morphology.disk(erosion))
+     else:
+         eroded_mask = mask.cpu().numpy()
+
+     img_[eroded_mask <= 0] = np.nan
+
+     distance_to_boundary = ndimage.distance_transform_bf((~eroded_mask > 0), metric="cityblock")
+
+     for current_dist in np.unique(distance_to_boundary)[1:]:
+         ii, jj = np.where(distance_to_boundary == current_dist)
+
+         ii_ = np.array([ii - 1, ii, ii + 1, ii - 1, ii, ii + 1, ii - 1, ii, ii + 1]).reshape(9, -1)
+         jj_ = np.array([jj - 1, jj - 1, jj - 1, jj, jj, jj, jj + 1, jj + 1, jj + 1]).reshape(9, -1)
+
+         ii_ = ii_.clip(0, img_.shape[0] - 1)
+         jj_ = jj_.clip(0, img_.shape[1] - 1)
+
+         img_[ii, jj] = np.nanmax(img_[ii_, jj_], axis=0)
+
+     return torch.from_numpy(img_).to(img.device)
+
+ def snap_high_gradients_to_nn(depth, threshold=20):
+     grad_depth = np.copy(depth.cpu().numpy())
+     grad_depth = grad_depth - grad_depth.min()
+     grad_depth = grad_depth / grad_depth.max()
+
+     grad = skimage.filters.rank.gradient(grad_depth, skimage.morphology.disk(1))
+     return nearest_neighbor_fill(depth, torch.from_numpy(grad < threshold), erosion=3)
+
+ def project_points(cameras, depth, use_pixel_centers=True):
+     if len(cameras) > 1:
+         import warnings
+         warnings.warn("project_points assumes only a single camera is used")
+
+     depth_t = torch.from_numpy(depth) if isinstance(depth, np.ndarray) else depth
+     depth_t = depth_t.to(cameras.device)
+
+     pixel_center = 0.5 if use_pixel_centers else 0
+
+     fx, fy = cameras.focal_length[0, 1], cameras.focal_length[0, 0]
+     cx, cy = cameras.principal_point[0, 1], cameras.principal_point[0, 0]
+
+     i, j = torch.meshgrid(
+         torch.arange(cameras.image_size[0][0], dtype=torch.float32, device=cameras.device) + pixel_center,
+         torch.arange(cameras.image_size[0][1], dtype=torch.float32, device=cameras.device) + pixel_center,
+         indexing="xy",
+     )
+
+     directions = torch.stack(
+         [-(i - cx) * depth_t / fx, -(j - cy) * depth_t / fy, depth_t], -1
+     )
+
+     xy_depth_world = cameras.get_world_to_view_transform().inverse().transform_points(directions.view(-1, 3)).unsqueeze(0)
+
+     return xy_depth_world
+
+ def get_pointcloud(xy_depth_world, device="cpu", features=None):
+     point_cloud = Pointclouds(points=[xy_depth_world.to(device)], features=[features] if features is not None else None)
+     return point_cloud
+
+ def merge_pointclouds(point_clouds):
+     points = torch.cat([pc.points_padded() for pc in point_clouds], dim=1)
+     features = torch.cat([pc.features_padded() for pc in point_clouds], dim=1)
+     return Pointclouds(points=[points[0]], features=[features[0]])
+
+ def outpaint_with_depth_estimation(image, mask, previous_depth, h, w, pipe, zoe_dc, prompt, cameras, dilation_size: int = 2, depth_scaling: float = 1, generator = None):
+     img_input = Image.fromarray((255*image[..., :3].cpu().numpy()).astype(np.uint8))
+
+     # we slightly dilate the mask as aliasing might cause us to receive a too small mask from pytorch3d
+     img_mask = Image.fromarray((255*skimage.morphology.isotropic_dilation((~mask).cpu().numpy(), radius=dilation_size)).astype(np.uint8))
+
+     out_image = pipe(prompt=prompt, image=img_input, mask_image=img_mask, height=h, width=w, generator=generator).images[0]
+     out_depth = infer_with_zoe_dc(zoe_dc, torch.from_numpy(np.asarray(out_image)/255.).permute(2,0,1).float().to(zoe_dc.device), (previous_depth * mask).to(zoe_dc.device), scaling=depth_scaling).cpu().numpy()
+
+     return out_image, out_depth
+
+ def fov2focal(fov, pixels):
+     return pixels / (2 * math.tan(fov / 2))
+
+ def focal2fov(focal, pixels):
+     return 2*math.atan(pixels/(2*focal))
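
A quick sanity check of the pinhole relations above: fov2focal and focal2fov are inverses, and a 90° field of view across 1000 pixels corresponds to a 500-pixel focal length:

import math
from utils.ops import fov2focal, focal2fov

f = fov2focal(math.pi / 2, 1000)  # 1000 / (2 * tan(45°)) = 500.0
assert abs(f - 500.0) < 1e-9
assert abs(focal2fov(f, 1000) - math.pi / 2) < 1e-9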
utils/render.py ADDED
@@ -0,0 +1,112 @@
+ import torch
+ import torch.nn.functional as F
+ import skimage
+ from pytorch3d.structures import Pointclouds
+ from pytorch3d.renderer import (
+     look_at_view_transform,
+     FoVOrthographicCameras,
+     FoVPerspectiveCameras,
+     PerspectiveCameras,
+     PointsRasterizationSettings,
+     PointsRenderer,
+     PulsarPointsRenderer,
+     PointsRasterizer,
+     AlphaCompositor,
+     NormWeightedCompositor
+ )
+ from .ops import nearest_neighbor_fill
+
+ from typing import cast, Optional
+
+ class PointsRendererWithMasks(PointsRenderer):
+     def forward(self, point_clouds, **kwargs) -> torch.Tensor:
+         fragments = self.rasterizer(point_clouds, **kwargs)
+
+         # Construct weights based on the distance of a point to the true point.
+         # However, this could be done differently: e.g. predicted as opposed
+         # to a function of the weights.
+         r = self.rasterizer.raster_settings.radius
+
+         dists2 = fragments.dists
+         weights = torch.ones_like(dists2)  # alternatively: 1 - dists2 / (r * r)
+         ok = cast(torch.BoolTensor, (fragments.idx >= 0)).float()
+
+         weights = weights * ok
+
+         fragments_prm = fragments.idx.long().permute(0, 3, 1, 2)
+         weights_prm = weights.permute(0, 3, 1, 2)
+         images = self.compositor(
+             fragments_prm,
+             weights_prm,
+             point_clouds.features_packed().permute(1, 0),
+             **kwargs,
+         )
+
+         cumprod = torch.cumprod(1 - weights, dim=-1)
+         cumprod = torch.cat((torch.ones_like(cumprod[..., :1]), cumprod[..., :-1]), dim=-1)
+         depths = (weights * cumprod * fragments.zbuf).sum(dim=-1)
+
+         # permute so image comes at the end
+         images = images.permute(0, 2, 3, 1)
+         masks = fragments.idx.long()[..., 0] >= 0
+
+         return images, masks, depths
+
+ def render_with_settings(cameras, point_cloud, raster_settings, antialiasing: int = 1):
+     if antialiasing > 1:
+         raster_settings.image_size = (raster_settings.image_size[0] * antialiasing, raster_settings.image_size[1] * antialiasing)
+
+     rasterizer = PointsRasterizer(cameras=cameras, raster_settings=raster_settings)
+
+     renderer = PointsRendererWithMasks(
+         rasterizer=rasterizer,
+         compositor=AlphaCompositor()
+     )
+
+     if antialiasing > 1:
+         images, masks, depths = renderer(point_cloud)
+
+         images = images.permute(0, 3, 1, 2)  # NHWC -> NCHW
+         images = F.avg_pool2d(images, kernel_size=antialiasing, stride=antialiasing)
+         images = images.permute(0, 2, 3, 1)  # NCHW -> NHWC
+
+         # pool the masks and depths down to the target resolution as well
+         masks = F.max_pool2d(masks.float().unsqueeze(1), kernel_size=antialiasing, stride=antialiasing).squeeze(1).bool()
+         depths = F.avg_pool2d(depths.unsqueeze(1), kernel_size=antialiasing, stride=antialiasing).squeeze(1)
+
+         return images, masks, depths
+
+     else:
+         return renderer(point_cloud)
+
+ def render(cameras, point_cloud, fill_point_cloud_holes: bool = False, radius: Optional[float] = None, antialiasing: int = 1):
+     if fill_point_cloud_holes:
+         coarse_raster_settings = PointsRasterizationSettings(
+             image_size=(int(cameras.image_size[0, 1]), int(cameras.image_size[0, 0])),
+             radius = 1e-2,
+             points_per_pixel = 1
+         )
+
+         _, coarse_mask, _ = render_with_settings(cameras, point_cloud, coarse_raster_settings)
+
+         eroded_coarse_mask = torch.from_numpy(skimage.morphology.binary_erosion(coarse_mask[0].cpu().numpy(), footprint=skimage.morphology.disk(2)))
+
+         raster_settings = PointsRasterizationSettings(
+             image_size=(int(cameras.image_size[0, 1]), int(cameras.image_size[0, 0])),
+             radius = (1 / float(max(cameras.image_size[0, 1], cameras.image_size[0, 0])) * 2.0) if radius is None else radius,
+             points_per_pixel = 16
+         )
+
+         # Render the scene
+         images, masks, depths = render_with_settings(cameras, point_cloud, raster_settings, antialiasing=antialiasing)
+
+         holes_in_rendering = masks[0].cpu() ^ eroded_coarse_mask
+
+         images[0] = nearest_neighbor_fill(images[0], ~holes_in_rendering, 0)
+         depths[0] = nearest_neighbor_fill(depths[0], ~holes_in_rendering, 0)
+
+         return images, eroded_coarse_mask.unsqueeze(0).to(masks.device), depths
+
+     else:
+         raster_settings = PointsRasterizationSettings(
+             image_size=(int(cameras.image_size[0, 1]), int(cameras.image_size[0, 0])),
+             radius = (1 / float(max(cameras.image_size[0, 1], cameras.image_size[0, 0])) * 2.0) if radius is None else radius,
+             points_per_pixel = 16
+         )
+
+         # Render the scene
+         return render_with_settings(cameras, point_cloud, raster_settings, antialiasing=antialiasing)
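
A hedged usage sketch: rendering a point cloud into RGB, mask, and depth with the helpers above. Camera construction mirrors app.py; `point_cloud` is assumed to be a pytorch3d Pointclouds instance (e.g. from utils.ops.get_pointcloud) on the same device:

import torch
from pytorch3d.renderer import PerspectiveCameras, look_at_view_transform
from utils.render import render

h = w = 256
R, T = look_at_view_transform(azim=0, elev=0, dist=0.01, device="cuda")
cameras = PerspectiveCameras(
    R=R, T=T,
    focal_length=torch.tensor([w], dtype=torch.float32),
    principal_point=(((h - 1) / 2, (w - 1) / 2),),
    image_size=((h, w),), in_ndc=False, device="cuda",
)
images, masks, depths = render(cameras, point_cloud)  # each batched with N=1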
utils/scene/__init__.py ADDED
@@ -0,0 +1,92 @@
+ #
+ # Copyright (C) 2023, Inria
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
+ # All rights reserved.
+ #
+ # This software is free for non-commercial, research and evaluation use
+ # under the terms of the LICENSE.md file.
+ #
+ # For inquiries contact [email protected]
+ #
+
+ import os
+ import random
+ import json
+ from .utils.system_utils import searchForMaxIteration
+ from .dataset_readers import sceneLoadTypeCallbacks
+ from .gaussian_model import GaussianModel
+ from .utils.camera_utils import cameraList_from_camInfos, camera_to_JSON
+
+ class Scene:
+
+     gaussians : GaussianModel
+
+     def __init__(self, args, gaussians : GaussianModel, load_iteration=None, shuffle=True, resolution_scales=[1.0]):
+         """
+         :param path: Path to colmap scene main folder.
+         """
+         self.model_path = args.model_path
+         self.loaded_iter = None
+         self.gaussians = gaussians
+
+         if load_iteration:
+             if load_iteration == -1:
+                 self.loaded_iter = searchForMaxIteration(os.path.join(self.model_path, "point_cloud"))
+             else:
+                 self.loaded_iter = load_iteration
+             print("Loading trained model at iteration {}".format(self.loaded_iter))
+
+         self.train_cameras = {}
+         self.test_cameras = {}
+
+         if os.path.exists(os.path.join(args.source_path, "sparse")):
+             scene_info = sceneLoadTypeCallbacks["Colmap"](args.source_path, args.images, args.eval)
+         elif os.path.exists(os.path.join(args.source_path, "transforms_train.json")):
+             print("Found transforms_train.json file, assuming Blender data set!")
+             scene_info = sceneLoadTypeCallbacks["Blender"](args.source_path, args.white_background, args.eval)
+         else:
+             assert False, "Could not recognize scene type!"
+
+         if not self.loaded_iter:
+             with open(scene_info.ply_path, 'rb') as src_file, open(os.path.join(self.model_path, "input.ply"), 'wb') as dest_file:
+                 dest_file.write(src_file.read())
+             json_cams = []
+             camlist = []
+             if scene_info.test_cameras:
+                 camlist.extend(scene_info.test_cameras)
+             if scene_info.train_cameras:
+                 camlist.extend(scene_info.train_cameras)
+             for id, cam in enumerate(camlist):
+                 json_cams.append(camera_to_JSON(id, cam))
+             with open(os.path.join(self.model_path, "cameras.json"), 'w') as file:
+                 json.dump(json_cams, file)
+
+         if shuffle:
+             random.shuffle(scene_info.train_cameras)  # Multi-res consistent random shuffling
+             random.shuffle(scene_info.test_cameras)  # Multi-res consistent random shuffling
+
+         self.cameras_extent = scene_info.nerf_normalization["radius"]
+
+         for resolution_scale in resolution_scales:
+             print("Loading Training Cameras")
+             self.train_cameras[resolution_scale] = cameraList_from_camInfos(scene_info.train_cameras, resolution_scale, args)
+             print("Loading Test Cameras")
+             self.test_cameras[resolution_scale] = cameraList_from_camInfos(scene_info.test_cameras, resolution_scale, args)
+
+         if self.loaded_iter:
+             self.gaussians.load_ply(os.path.join(self.model_path,
+                                                  "point_cloud",
+                                                  "iteration_" + str(self.loaded_iter),
+                                                  "point_cloud.ply"))
+         else:
+             self.gaussians.create_from_pcd(scene_info.point_cloud, self.cameras_extent)
+
+     def save(self, iteration):
+         point_cloud_path = os.path.join(self.model_path, "point_cloud/iteration_{}".format(iteration))
+         self.gaussians.save_ply(os.path.join(point_cloud_path, "point_cloud.ply"))
+
+     def getTrainCameras(self, scale=1.0):
+         return self.train_cameras[scale]
+
+     def getTestCameras(self, scale=1.0):
+         return self.test_cameras[scale]
utils/scene/cameras.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import math
14
+ from torch import nn
15
+ import numpy as np
16
+ from .utils.graphics_utils import getWorld2View2, getProjectionMatrix
17
+
18
+ class Camera(nn.Module):
19
+ def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask,
20
+ image_name, uid, crop_box=None, mask=None,
21
+ trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda"
22
+ ):
23
+ super(Camera, self).__init__()
24
+
25
+ self.uid = uid
26
+ self.colmap_id = colmap_id
27
+ self.R = R
28
+ self.T = T
29
+ self.FoVx = FoVx
30
+ self.FoVy = FoVy
31
+ self.image_name = image_name
32
+ self.crop_box = crop_box
33
+ self.mask = mask
34
+
35
+ try:
36
+ self.data_device = torch.device(data_device)
37
+ except Exception as e:
38
+ print(e)
39
+ print(f"[Warning] Custom device {data_device} failed, fallback to default cuda device" )
40
+ self.data_device = torch.device("cuda")
41
+
42
+ self.original_image = image.clamp(0.0, 1.0).to(self.data_device)
43
+ self.image_width = self.original_image.shape[2]
44
+ self.image_height = self.original_image.shape[1]
45
+
46
+ self.gt_alpha_mask = gt_alpha_mask
47
+
48
+ #if gt_alpha_mask is not None:
49
+ # self.original_image *= gt_alpha_mask.to(self.data_device)
50
+ #else:
51
+ # self.original_image *= torch.ones((1, self.image_height, self.image_width), device=self.data_device)
52
+
53
+ self.zfar = 100.0
54
+ self.znear = 0.01
55
+
56
+ self.trans = trans
57
+ self.scale = scale
58
+
59
+ self.world_view_transform = torch.tensor(getWorld2View2(R, T, trans, scale)).transpose(0, 1).cuda()
60
+ self.projection_matrix = getProjectionMatrix(znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy, crop_box=self.crop_box, width=self.image_width, height=self.image_height).transpose(0,1).cuda()
61
+ self.full_proj_transform = (self.world_view_transform.unsqueeze(0).bmm(self.projection_matrix.unsqueeze(0))).squeeze(0)
62
+ self.camera_center = self.world_view_transform.inverse()[3, :3]
63
+
64
+ class MiniCam:
65
+ def __init__(self, width, height, fovy, fovx, znear, zfar, world_view_transform, full_proj_transform):
66
+ self.image_width = width
67
+ self.image_height = height
68
+ self.FoVy = fovy
69
+ self.FoVx = fovx
70
+ self.znear = znear
71
+ self.zfar = zfar
72
+ self.world_view_transform = world_view_transform
73
+ self.full_proj_transform = full_proj_transform
74
+ view_inv = torch.inverse(self.world_view_transform)
75
+ self.camera_center = view_inv[3][:3]
76
+
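The full Camera carries image tensors; MiniCam keeps only what rasterization needs. A sketch of deriving one from the other (hypothetical helper, not in this commit):

    from utils.scene.cameras import MiniCam

    def to_minicam(cam):
        # Reuses the transforms precomputed in Camera.__init__; drops image data.
        return MiniCam(cam.image_width, cam.image_height,
                       cam.FoVy, cam.FoVx,
                       cam.znear, cam.zfar,
                       cam.world_view_transform, cam.full_proj_transform)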
utils/scene/colmap_loader.py ADDED
@@ -0,0 +1,294 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import numpy as np
13
+ import collections
14
+ import struct
15
+
16
+ CameraModel = collections.namedtuple(
17
+ "CameraModel", ["model_id", "model_name", "num_params"])
18
+ Camera = collections.namedtuple(
19
+ "Camera", ["id", "model", "width", "height", "params"])
20
+ BaseImage = collections.namedtuple(
21
+ "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"])
22
+ Point3D = collections.namedtuple(
23
+ "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"])
24
+ CAMERA_MODELS = {
25
+ CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
26
+ CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
27
+ CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
28
+ CameraModel(model_id=3, model_name="RADIAL", num_params=5),
29
+ CameraModel(model_id=4, model_name="OPENCV", num_params=8),
30
+ CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
31
+ CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
32
+ CameraModel(model_id=7, model_name="FOV", num_params=5),
33
+ CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
34
+ CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
35
+ CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12)
36
+ }
37
+ CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model)
38
+ for camera_model in CAMERA_MODELS])
39
+ CAMERA_MODEL_NAMES = dict([(camera_model.model_name, camera_model)
40
+ for camera_model in CAMERA_MODELS])
41
+
42
+
43
+ def qvec2rotmat(qvec):
44
+ return np.array([
45
+ [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,
46
+ 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
47
+ 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]],
48
+ [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
49
+ 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,
50
+ 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]],
51
+ [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
52
+ 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
53
+ 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]])
54
+
55
+ def rotmat2qvec(R):
56
+ Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
57
+ K = np.array([
58
+ [Rxx - Ryy - Rzz, 0, 0, 0],
59
+ [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
60
+ [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
61
+ [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0
62
+ eigvals, eigvecs = np.linalg.eigh(K)
63
+ qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
64
+ if qvec[0] < 0:
65
+ qvec *= -1
66
+ return qvec
67
+
68
+ class Image(BaseImage):
69
+ def qvec2rotmat(self):
70
+ return qvec2rotmat(self.qvec)
71
+
72
+ def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
73
+ """Read and unpack the next bytes from a binary file.
74
+ :param fid: open binary file object to read from.
75
+ :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
76
+ :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
77
+ :param endian_character: Any of {@, =, <, >, !}
78
+ :return: Tuple of read and unpacked values.
79
+ """
80
+ data = fid.read(num_bytes)
81
+ return struct.unpack(endian_character + format_char_sequence, data)
82
+
83
+ def read_points3D_text(path):
84
+ """
85
+ see: src/base/reconstruction.cc
86
+ void Reconstruction::ReadPoints3DText(const std::string& path)
87
+ void Reconstruction::WritePoints3DText(const std::string& path)
88
+ """
89
+ xyzs = None
90
+ rgbs = None
91
+ errors = None
92
+ num_points = 0
93
+ with open(path, "r") as fid:
94
+ while True:
95
+ line = fid.readline()
96
+ if not line:
97
+ break
98
+ line = line.strip()
99
+ if len(line) > 0 and line[0] != "#":
100
+ num_points += 1
101
+
102
+
103
+ xyzs = np.empty((num_points, 3))
104
+ rgbs = np.empty((num_points, 3))
105
+ errors = np.empty((num_points, 1))
106
+ count = 0
107
+ with open(path, "r") as fid:
108
+ while True:
109
+ line = fid.readline()
110
+ if not line:
111
+ break
112
+ line = line.strip()
113
+ if len(line) > 0 and line[0] != "#":
114
+ elems = line.split()
115
+ xyz = np.array(tuple(map(float, elems[1:4])))
116
+ rgb = np.array(tuple(map(int, elems[4:7])))
117
+ error = np.array(float(elems[7]))
118
+ xyzs[count] = xyz
119
+ rgbs[count] = rgb
120
+ errors[count] = error
121
+ count += 1
122
+
123
+ return xyzs, rgbs, errors
124
+
125
+ def read_points3D_binary(path_to_model_file):
126
+ """
127
+ see: src/base/reconstruction.cc
128
+ void Reconstruction::ReadPoints3DBinary(const std::string& path)
129
+ void Reconstruction::WritePoints3DBinary(const std::string& path)
130
+ """
131
+
132
+
133
+ with open(path_to_model_file, "rb") as fid:
134
+ num_points = read_next_bytes(fid, 8, "Q")[0]
135
+
136
+ xyzs = np.empty((num_points, 3))
137
+ rgbs = np.empty((num_points, 3))
138
+ errors = np.empty((num_points, 1))
139
+
140
+ for p_id in range(num_points):
141
+ binary_point_line_properties = read_next_bytes(
142
+ fid, num_bytes=43, format_char_sequence="QdddBBBd")
143
+ xyz = np.array(binary_point_line_properties[1:4])
144
+ rgb = np.array(binary_point_line_properties[4:7])
145
+ error = np.array(binary_point_line_properties[7])
146
+ track_length = read_next_bytes(
147
+ fid, num_bytes=8, format_char_sequence="Q")[0]
148
+ track_elems = read_next_bytes(
149
+ fid, num_bytes=8*track_length,
150
+ format_char_sequence="ii"*track_length)
151
+ xyzs[p_id] = xyz
152
+ rgbs[p_id] = rgb
153
+ errors[p_id] = error
154
+ return xyzs, rgbs, errors
155
+
156
+ def read_intrinsics_text(path):
157
+ """
158
+ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
159
+ """
160
+ cameras = {}
161
+ with open(path, "r") as fid:
162
+ while True:
163
+ line = fid.readline()
164
+ if not line:
165
+ break
166
+ line = line.strip()
167
+ if len(line) > 0 and line[0] != "#":
168
+ elems = line.split()
169
+ camera_id = int(elems[0])
170
+ model = elems[1]
171
+ assert model == "PINHOLE", "While the loader support other types, the rest of the code assumes PINHOLE"
172
+ width = int(elems[2])
173
+ height = int(elems[3])
174
+ params = np.array(tuple(map(float, elems[4:])))
175
+ cameras[camera_id] = Camera(id=camera_id, model=model,
176
+ width=width, height=height,
177
+ params=params)
178
+ return cameras
179
+
180
+ def read_extrinsics_binary(path_to_model_file):
181
+ """
182
+ see: src/base/reconstruction.cc
183
+ void Reconstruction::ReadImagesBinary(const std::string& path)
184
+ void Reconstruction::WriteImagesBinary(const std::string& path)
185
+ """
186
+ images = {}
187
+ with open(path_to_model_file, "rb") as fid:
188
+ num_reg_images = read_next_bytes(fid, 8, "Q")[0]
189
+ for _ in range(num_reg_images):
190
+ binary_image_properties = read_next_bytes(
191
+ fid, num_bytes=64, format_char_sequence="idddddddi")
192
+ image_id = binary_image_properties[0]
193
+ qvec = np.array(binary_image_properties[1:5])
194
+ tvec = np.array(binary_image_properties[5:8])
195
+ camera_id = binary_image_properties[8]
196
+ image_name = ""
197
+ current_char = read_next_bytes(fid, 1, "c")[0]
198
+ while current_char != b"\x00": # look for the ASCII 0 entry
199
+ image_name += current_char.decode("utf-8")
200
+ current_char = read_next_bytes(fid, 1, "c")[0]
201
+ num_points2D = read_next_bytes(fid, num_bytes=8,
202
+ format_char_sequence="Q")[0]
203
+ x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D,
204
+ format_char_sequence="ddq"*num_points2D)
205
+ xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),
206
+ tuple(map(float, x_y_id_s[1::3]))])
207
+ point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
208
+ images[image_id] = Image(
209
+ id=image_id, qvec=qvec, tvec=tvec,
210
+ camera_id=camera_id, name=image_name,
211
+ xys=xys, point3D_ids=point3D_ids)
212
+ return images
213
+
214
+
215
+ def read_intrinsics_binary(path_to_model_file):
216
+ """
217
+ see: src/base/reconstruction.cc
218
+ void Reconstruction::WriteCamerasBinary(const std::string& path)
219
+ void Reconstruction::ReadCamerasBinary(const std::string& path)
220
+ """
221
+ cameras = {}
222
+ with open(path_to_model_file, "rb") as fid:
223
+ num_cameras = read_next_bytes(fid, 8, "Q")[0]
224
+ for _ in range(num_cameras):
225
+ camera_properties = read_next_bytes(
226
+ fid, num_bytes=24, format_char_sequence="iiQQ")
227
+ camera_id = camera_properties[0]
228
+ model_id = camera_properties[1]
229
+ model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
230
+ width = camera_properties[2]
231
+ height = camera_properties[3]
232
+ num_params = CAMERA_MODEL_IDS[model_id].num_params
233
+ params = read_next_bytes(fid, num_bytes=8*num_params,
234
+ format_char_sequence="d"*num_params)
235
+ cameras[camera_id] = Camera(id=camera_id,
236
+ model=model_name,
237
+ width=width,
238
+ height=height,
239
+ params=np.array(params))
240
+ assert len(cameras) == num_cameras
241
+ return cameras
242
+
243
+
244
+ def read_extrinsics_text(path):
245
+ """
246
+ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
247
+ """
248
+ images = {}
249
+ with open(path, "r") as fid:
250
+ while True:
251
+ line = fid.readline()
252
+ if not line:
253
+ break
254
+ line = line.strip()
255
+ if len(line) > 0 and line[0] != "#":
256
+ elems = line.split()
257
+ image_id = int(elems[0])
258
+ qvec = np.array(tuple(map(float, elems[1:5])))
259
+ tvec = np.array(tuple(map(float, elems[5:8])))
260
+ camera_id = int(elems[8])
261
+ image_name = elems[9]
262
+ elems = fid.readline().split()
263
+ xys = np.column_stack([tuple(map(float, elems[0::3])),
264
+ tuple(map(float, elems[1::3]))])
265
+ point3D_ids = np.array(tuple(map(int, elems[2::3])))
266
+ images[image_id] = Image(
267
+ id=image_id, qvec=qvec, tvec=tvec,
268
+ camera_id=camera_id, name=image_name,
269
+ xys=xys, point3D_ids=point3D_ids)
270
+ return images
271
+
272
+
273
+ def read_colmap_bin_array(path):
274
+ """
275
+ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_dense.py
276
+
277
+ :param path: path to the colmap binary file.
278
+ :return: nd array with the floating point values read from the file
279
+ """
280
+ with open(path, "rb") as fid:
281
+ width, height, channels = np.genfromtxt(fid, delimiter="&", max_rows=1,
282
+ usecols=(0, 1, 2), dtype=int)
283
+ fid.seek(0)
284
+ num_delimiter = 0
285
+ byte = fid.read(1)
286
+ while True:
287
+ if byte == b"&":
288
+ num_delimiter += 1
289
+ if num_delimiter >= 3:
290
+ break
291
+ byte = fid.read(1)
292
+ array = np.fromfile(fid, np.float32)
293
+ array = array.reshape((width, height, channels), order="F")
294
+ return np.transpose(array, (1, 0, 2)).squeeze()
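The binary and text readers above pair up by COLMAP output format. A sketch of loading a sparse reconstruction directly (the path is illustrative, not from this commit):

    import os
    from utils.scene.colmap_loader import (read_extrinsics_binary,
                                           read_intrinsics_binary,
                                           read_points3D_binary)

    sparse = "data/demo/sparse/0"                       # hypothetical scene folder
    images = read_extrinsics_binary(os.path.join(sparse, "images.bin"))
    cameras = read_intrinsics_binary(os.path.join(sparse, "cameras.bin"))
    xyz, rgb, err = read_points3D_binary(os.path.join(sparse, "points3D.bin"))
    R = next(iter(images.values())).qvec2rotmat()       # world-to-camera rotation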
utils/scene/dataset_readers.py ADDED
@@ -0,0 +1,270 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import os
13
+ import sys
14
+ from PIL import Image
15
+ from typing import NamedTuple
16
+ from .colmap_loader import read_extrinsics_text, read_intrinsics_text, qvec2rotmat, \
17
+ read_extrinsics_binary, read_intrinsics_binary, read_points3D_binary, read_points3D_text
18
+ from .utils.graphics_utils import getWorld2View2, focal2fov, fov2focal
19
+ import numpy as np
20
+ import json
21
+ from pathlib import Path
22
+ from plyfile import PlyData, PlyElement
23
+ from .utils.sh_utils import SH2RGB
24
+ from .gaussian_model import BasicPointCloud
25
+
26
+ class CameraInfo(NamedTuple):
27
+ uid: int
28
+ R: np.array
29
+ T: np.array
30
+ FovY: np.array
31
+ FovX: np.array
32
+ image: np.array
33
+ image_path: str
34
+ image_name: str
35
+ mask: np.array
36
+ mask_path: str
37
+ width: int
38
+ height: int
39
+
40
+ class SceneInfo(NamedTuple):
41
+ point_cloud: BasicPointCloud
42
+ train_cameras: list
43
+ test_cameras: list
44
+ nerf_normalization: dict
45
+ ply_path: str
46
+
47
+ def getNerfppNorm(cam_info):
48
+ def get_center_and_diag(cam_centers):
49
+ cam_centers = np.hstack(cam_centers)
50
+ avg_cam_center = np.mean(cam_centers, axis=1, keepdims=True)
51
+ center = avg_cam_center
52
+ dist = np.linalg.norm(cam_centers - center, axis=0, keepdims=True)
53
+ diagonal = np.max(dist)
54
+ return center.flatten(), diagonal
55
+
56
+ cam_centers = []
57
+
58
+ for cam in cam_info:
59
+ W2C = getWorld2View2(cam.R, cam.T)
60
+ C2W = np.linalg.inv(W2C)
61
+ cam_centers.append(C2W[:3, 3:4])
62
+
63
+ center, diagonal = get_center_and_diag(cam_centers)
64
+ radius = diagonal * 1.1
65
+
66
+ translate = -center
67
+
68
+ return {"translate": translate, "radius": radius}
69
+
70
+ def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder, masks_folder):
71
+ cam_infos = []
72
+ for idx, key in enumerate(cam_extrinsics):
73
+ sys.stdout.write('\r')
74
+ # single-line progress update, overwritten in place via the '\r' above
75
+ sys.stdout.write("Reading camera {}/{}".format(idx+1, len(cam_extrinsics)))
76
+ sys.stdout.flush()
77
+
78
+ extr = cam_extrinsics[key]
79
+ intr = cam_intrinsics[extr.camera_id]
80
+ height = intr.height
81
+ width = intr.width
82
+
83
+ uid = intr.id
84
+ R = np.transpose(qvec2rotmat(extr.qvec))
85
+ T = np.array(extr.tvec)
86
+
87
+ if intr.model=="SIMPLE_PINHOLE":
88
+ focal_length_x = intr.params[0]
89
+ FovY = focal2fov(focal_length_x, height)
90
+ FovX = focal2fov(focal_length_x, width)
91
+ elif intr.model=="PINHOLE":
92
+ focal_length_x = intr.params[0]
93
+ focal_length_y = intr.params[1]
94
+ FovY = focal2fov(focal_length_y, height)
95
+ FovX = focal2fov(focal_length_x, width)
96
+ else:
97
+ assert False, "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!"
98
+
99
+ image_path = os.path.join(images_folder, os.path.basename(extr.name))
100
+ image_name = os.path.basename(image_path).split(".")[0]
101
+ image = Image.open(image_path)
102
+
103
+ mask_path = os.path.join(masks_folder, os.path.basename(extr.name).replace(".jpg", ".png"))
104
+ try:
105
+ mask = Image.open(mask_path)
106
+ except OSError:  # no mask for this image
107
+ mask = None
108
+
109
+ cam_info = CameraInfo(uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=image, mask=mask, mask_path=mask_path,
110
+ image_path=image_path, image_name=image_name, width=width, height=height)
111
+ cam_infos.append(cam_info)
112
+ sys.stdout.write('\n')
113
+ return cam_infos
114
+
115
+ def fetchPly(path):
116
+ plydata = PlyData.read(path)
117
+ vertices = plydata['vertex']
118
+ positions = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T
119
+ colors = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T / 255.0
120
+ normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T
121
+ return BasicPointCloud(points=positions, colors=colors, normals=normals)
122
+
123
+ def storePly(path, xyz, rgb):
124
+ # Define the dtype for the structured array
125
+ dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'),
126
+ ('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'),
127
+ ('red', 'u1'), ('green', 'u1'), ('blue', 'u1')]
128
+
129
+ normals = np.zeros_like(xyz)
130
+
131
+ elements = np.empty(xyz.shape[0], dtype=dtype)
132
+ attributes = np.concatenate((xyz, normals, rgb), axis=1)
133
+ elements[:] = list(map(tuple, attributes))
134
+
135
+ # Create the PlyData object and write to file
136
+ vertex_element = PlyElement.describe(elements, 'vertex')
137
+ ply_data = PlyData([vertex_element])
138
+ ply_data.write(path)
139
+
140
+ def readColmapSceneInfo(path, images, eval, llffhold=8):
141
+ try:
142
+ cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.bin")
143
+ cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.bin")
144
+ cam_extrinsics = read_extrinsics_binary(cameras_extrinsic_file)
145
+ cam_intrinsics = read_intrinsics_binary(cameras_intrinsic_file)
146
+ except Exception:  # fall back to the text-format model files
147
+ cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.txt")
148
+ cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.txt")
149
+ cam_extrinsics = read_extrinsics_text(cameras_extrinsic_file)
150
+ cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file)
151
+
152
+ reading_dir = "images" if images == None else images
153
+ # FIXME: make the mask directory configurable, like the images directory above
154
+ mask_reading_dir = "masks"# if images == None else images
155
+ cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics, images_folder=os.path.join(path, reading_dir), masks_folder=os.path.join(path, mask_reading_dir))
156
+ cam_infos = sorted(cam_infos_unsorted.copy(), key = lambda x : x.image_name)
157
+
158
+ if eval:
159
+ train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0]
160
+ test_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold == 0]
161
+ else:
162
+ train_cam_infos = cam_infos
163
+ test_cam_infos = []
164
+
165
+ nerf_normalization = getNerfppNorm(train_cam_infos)
166
+
167
+ ply_path = os.path.join(path, "sparse/0/points3D.ply")
168
+ bin_path = os.path.join(path, "sparse/0/points3D.bin")
169
+ txt_path = os.path.join(path, "sparse/0/points3D.txt")
170
+ if not os.path.exists(ply_path):
171
+ print("Converting point3d.bin to .ply, will happen only the first time you open the scene.")
172
+ try:
173
+ xyz, rgb, _ = read_points3D_binary(bin_path)
174
+ except Exception:
175
+ xyz, rgb, _ = read_points3D_text(txt_path)
176
+ storePly(ply_path, xyz, rgb)
177
+ try:
178
+ pcd = fetchPly(ply_path)
179
+ except Exception:
180
+ pcd = None
181
+
182
+ scene_info = SceneInfo(point_cloud=pcd,
183
+ train_cameras=train_cam_infos,
184
+ test_cameras=test_cam_infos,
185
+ nerf_normalization=nerf_normalization,
186
+ ply_path=ply_path)
187
+ return scene_info
188
+
189
+ def readCamerasFromTransforms(path, transformsfile, white_background, extension=".png"):
190
+ cam_infos = []
191
+
192
+ with open(os.path.join(path, transformsfile)) as json_file:
193
+ contents = json.load(json_file)
194
+ fovx = contents["camera_angle_x"]
195
+
196
+ frames = contents["frames"]
197
+ for idx, frame in enumerate(frames):
198
+ cam_name = os.path.join(path, frame["file_path"] + extension)
199
+
200
+ # NeRF 'transform_matrix' is a camera-to-world transform
201
+ c2w = np.array(frame["transform_matrix"])
202
+ # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward)
203
+ c2w[:3, 1:3] *= -1
204
+
205
+ # get the world-to-camera transform and set R, T
206
+ w2c = np.linalg.inv(c2w)
207
+ R = np.transpose(w2c[:3,:3]) # R is stored transposed due to 'glm' in CUDA code
208
+ T = w2c[:3, 3]
209
+
210
+ image_path = os.path.join(path, cam_name)
211
+ image_name = Path(cam_name).stem
212
+ image = Image.open(image_path)
213
+
214
+ im_data = np.array(image.convert("RGBA"))
215
+
216
+ bg = np.array([1,1,1]) if white_background else np.array([0, 0, 0])
217
+
218
+ norm_data = im_data / 255.0
219
+ arr = norm_data[:,:,:3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4])
220
+ image = Image.fromarray(np.array(arr*255.0, dtype=np.uint8), "RGB")  # uint8: np.byte is signed and overflows above 127
221
+
222
+ fovy = focal2fov(fov2focal(fovx, image.size[0]), image.size[1])
223
+ FovY = fovy
224
+ FovX = fovx
225
+
226
+ cam_infos.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image, mask=None, mask_path=None,
227
+ image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1]))
228
+
229
+ return cam_infos
230
+
231
+ def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
232
+ print("Reading Training Transforms")
233
+ train_cam_infos = readCamerasFromTransforms(path, "transforms_train.json", white_background, extension)
234
+ print("Reading Test Transforms")
235
+ test_cam_infos = readCamerasFromTransforms(path, "transforms_test.json", white_background, extension)
236
+
237
+ if not eval:
238
+ train_cam_infos.extend(test_cam_infos)
239
+ test_cam_infos = []
240
+
241
+ nerf_normalization = getNerfppNorm(train_cam_infos)
242
+
243
+ ply_path = os.path.join(path, "points3d.ply")
244
+ if not os.path.exists(ply_path):
245
+ # Since this data set has no colmap data, we start with random points
246
+ num_pts = 100_000
247
+ print(f"Generating random point cloud ({num_pts})...")
248
+
249
+ # We create random points inside the bounds of the synthetic Blender scenes
250
+ xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3
251
+ shs = np.random.random((num_pts, 3)) / 255.0
252
+ pcd = BasicPointCloud(points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3)))
253
+
254
+ storePly(ply_path, xyz, SH2RGB(shs) * 255)
255
+ try:
256
+ pcd = fetchPly(ply_path)
257
+ except Exception:
258
+ pcd = None
259
+
260
+ scene_info = SceneInfo(point_cloud=pcd,
261
+ train_cameras=train_cam_infos,
262
+ test_cameras=test_cam_infos,
263
+ nerf_normalization=nerf_normalization,
264
+ ply_path=ply_path)
265
+ return scene_info
266
+
267
+ sceneLoadTypeCallbacks = {
268
+ "Colmap": readColmapSceneInfo,
269
+ "Blender" : readNerfSyntheticInfo
270
+ }
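sceneLoadTypeCallbacks is the dispatch table Scene.__init__ uses; a sketch of the same dispatch stand-alone (the path is illustrative):

    import os
    from utils.scene.dataset_readers import sceneLoadTypeCallbacks

    source_path = "data/demo"                           # hypothetical scene folder
    if os.path.exists(os.path.join(source_path, "sparse")):
        info = sceneLoadTypeCallbacks["Colmap"](source_path, None, False)
    else:
        info = sceneLoadTypeCallbacks["Blender"](source_path, True, False)
    print(len(info.train_cameras), "train /", len(info.test_cameras), "test cameras")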
utils/scene/gaussian_model.py ADDED
@@ -0,0 +1,416 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import numpy as np
14
+ from .utils.general_utils import inverse_sigmoid, get_expon_lr_func, build_rotation
15
+ from torch import nn
16
+ import os
17
+ from .utils.system_utils import mkdir_p
18
+ from plyfile import PlyData, PlyElement
19
+ from .utils.sh_utils import RGB2SH
20
+ from .utils.graphics_utils import BasicPointCloud
21
+ from .utils.general_utils import strip_symmetric, build_scaling_rotation
22
+
23
+ from scipy.spatial import KDTree
24
+
25
+ # credit to https://github.com/graphdeco-inria/gaussian-splatting/issues/292#issuecomment-2007934451
26
+ def distCUDA2(points):
27
+ points_np = points.detach().cpu().float().numpy()
28
+ dists, inds = KDTree(points_np).query(points_np, k=4)
29
+ meanDists = (dists[:, 1:] ** 2).mean(1)
30
+
31
+ return torch.tensor(meanDists, dtype=points.dtype, device=points.device)
32
+
33
+ class GaussianModel:
34
+
35
+ def setup_functions(self):
36
+ def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation):
37
+ L = build_scaling_rotation(scaling_modifier * scaling, rotation)
38
+ actual_covariance = L @ L.transpose(1, 2)
39
+ symm = strip_symmetric(actual_covariance)
40
+ return symm
41
+
42
+ self.scaling_activation = torch.exp
43
+ self.scaling_inverse_activation = torch.log
44
+
45
+ self.covariance_activation = build_covariance_from_scaling_rotation
46
+
47
+ self.opacity_activation = torch.sigmoid
48
+ self.inverse_opacity_activation = inverse_sigmoid
49
+
50
+ self.rotation_activation = torch.nn.functional.normalize
51
+
52
+
53
+ def __init__(self, sh_degree : int):
54
+ self.active_sh_degree = 0
55
+ self.max_sh_degree = sh_degree
56
+ self._xyz = torch.empty(0)
57
+ self._features_dc = torch.empty(0)
58
+ self._features_rest = torch.empty(0)
59
+ self._scaling = torch.empty(0)
60
+ self._rotation = torch.empty(0)
61
+ self._opacity = torch.empty(0)
62
+ self.max_radii2D = torch.empty(0)
63
+ self.xyz_gradient_accum = torch.empty(0)
64
+ self.denom = torch.empty(0)
65
+ self.optimizer = None
66
+ self.percent_dense = 0
67
+ self.spatial_lr_scale = 0
68
+ self.setup_functions()
69
+
70
+ def capture(self):
71
+ return (
72
+ self.active_sh_degree,
73
+ self._xyz,
74
+ self._features_dc,
75
+ self._features_rest,
76
+ self._scaling,
77
+ self._rotation,
78
+ self._opacity,
79
+ self.max_radii2D,
80
+ self.xyz_gradient_accum,
81
+ self.denom,
82
+ self.optimizer.state_dict(),
83
+ self.spatial_lr_scale,
84
+ )
85
+
86
+ def restore(self, model_args, training_args):
87
+ (self.active_sh_degree,
88
+ self._xyz,
89
+ self._features_dc,
90
+ self._features_rest,
91
+ self._scaling,
92
+ self._rotation,
93
+ self._opacity,
94
+ self.max_radii2D,
95
+ xyz_gradient_accum,
96
+ denom,
97
+ opt_dict,
98
+ self.spatial_lr_scale) = model_args
99
+ self.training_setup(training_args)
100
+ self.xyz_gradient_accum = xyz_gradient_accum
101
+ self.denom = denom
102
+ self.optimizer.load_state_dict(opt_dict)
103
+
104
+ @property
105
+ def get_scaling(self):
106
+ return self.scaling_activation(self._scaling)
107
+
108
+ @property
109
+ def get_rotation(self):
110
+ return self.rotation_activation(self._rotation)
111
+
112
+ @property
113
+ def get_xyz(self):
114
+ return self._xyz
115
+
116
+ @property
117
+ def get_features(self):
118
+ features_dc = self._features_dc
119
+ features_rest = self._features_rest
120
+ return torch.cat((features_dc, features_rest), dim=1)
121
+
122
+ @property
123
+ def get_opacity(self):
124
+ return self.opacity_activation(self._opacity)
125
+
126
+ def get_covariance(self, scaling_modifier = 1):
127
+ return self.covariance_activation(self.get_scaling, scaling_modifier, self._rotation)
128
+
129
+ def oneupSHdegree(self):
130
+ if self.active_sh_degree < self.max_sh_degree:
131
+ self.active_sh_degree += 1
132
+
133
+ def create_from_pcd(self, pcd : BasicPointCloud, spatial_lr_scale : float):
134
+ self.spatial_lr_scale = spatial_lr_scale
135
+ fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda()
136
+ fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda())
137
+ features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda()
138
+ features[:, :3, 0 ] = fused_color
139
+ features[:, 3:, 1:] = 0.0
140
+
141
+ print("Number of points at initialisation : ", fused_point_cloud.shape[0])
142
+
143
+ dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001)
144
+ scales = torch.log(torch.sqrt(dist2))[...,None].repeat(1, 3)
145
+ rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")
146
+ rots[:, 0] = 1
147
+
148
+ opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"))
149
+
150
+ self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))
151
+ self._features_dc = nn.Parameter(features[:,:,0:1].transpose(1, 2).contiguous().requires_grad_(True))
152
+ self._features_rest = nn.Parameter(features[:,:,1:].transpose(1, 2).contiguous().requires_grad_(True))
153
+ self._scaling = nn.Parameter(scales.requires_grad_(True))
154
+ self._rotation = nn.Parameter(rots.requires_grad_(True))
155
+ self._opacity = nn.Parameter(opacities.requires_grad_(True))
156
+ self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
157
+
158
+ def training_setup(self, training_args):
159
+ self.percent_dense = training_args.percent_dense
160
+ self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
161
+ self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
162
+
163
+ l = [
164
+ {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"},
165
+ {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"},
166
+ {'params': [self._features_rest], 'lr': training_args.feature_lr / 20.0, "name": "f_rest"},
167
+ {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"},
168
+ {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"},
169
+ {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"}
170
+ ]
171
+
172
+ self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
173
+ self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale,
174
+ lr_final=training_args.position_lr_final*self.spatial_lr_scale,
175
+ lr_delay_mult=training_args.position_lr_delay_mult,
176
+ max_steps=training_args.position_lr_max_steps)
177
+
178
+ def update_learning_rate(self, iteration):
179
+ ''' Learning rate scheduling per step '''
180
+ for param_group in self.optimizer.param_groups:
181
+ if param_group["name"] == "xyz":
182
+ lr = self.xyz_scheduler_args(iteration)
183
+ param_group['lr'] = lr
184
+ return lr
185
+
186
+ def construct_list_of_attributes(self):
187
+ l = ['x', 'y', 'z', 'nx', 'ny', 'nz']
188
+ # All channels except the 3 DC
189
+ for i in range(self._features_dc.shape[1]*self._features_dc.shape[2]):
190
+ l.append('f_dc_{}'.format(i))
191
+ for i in range(self._features_rest.shape[1]*self._features_rest.shape[2]):
192
+ l.append('f_rest_{}'.format(i))
193
+ l.append('opacity')
194
+ for i in range(self._scaling.shape[1]):
195
+ l.append('scale_{}'.format(i))
196
+ for i in range(self._rotation.shape[1]):
197
+ l.append('rot_{}'.format(i))
198
+ return l
199
+
200
+ def save_ply(self, path):
201
+ mkdir_p(os.path.dirname(path))
202
+
203
+ xyz = self._xyz.detach().cpu().numpy()
204
+ normals = np.zeros_like(xyz)
205
+ f_dc = self._features_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
206
+ f_rest = self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
207
+ opacities = self._opacity.detach().cpu().numpy()
208
+ scale = self._scaling.detach().cpu().numpy()
209
+ rotation = self._rotation.detach().cpu().numpy()
210
+
211
+ dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()]
212
+
213
+ elements = np.empty(xyz.shape[0], dtype=dtype_full)
214
+ attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1)
215
+ elements[:] = list(map(tuple, attributes))
216
+ el = PlyElement.describe(elements, 'vertex')
217
+ PlyData([el]).write(path)
218
+
219
+ def reset_opacity(self):
220
+ opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.01))
221
+ optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity")
222
+ self._opacity = optimizable_tensors["opacity"]
223
+
224
+ def load_ply(self, path):
225
+ plydata = PlyData.read(path)
226
+
227
+ xyz = np.stack((np.asarray(plydata.elements[0]["x"]),
228
+ np.asarray(plydata.elements[0]["y"]),
229
+ np.asarray(plydata.elements[0]["z"])), axis=1)
230
+ opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis]
231
+
232
+ features_dc = np.zeros((xyz.shape[0], 3, 1))
233
+ features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"])
234
+ features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"])
235
+ features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"])
236
+
237
+ extra_f_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("f_rest_")]
238
+ extra_f_names = sorted(extra_f_names, key = lambda x: int(x.split('_')[-1]))
239
+ assert len(extra_f_names)==3*(self.max_sh_degree + 1) ** 2 - 3
240
+ features_extra = np.zeros((xyz.shape[0], len(extra_f_names)))
241
+ for idx, attr_name in enumerate(extra_f_names):
242
+ features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name])
243
+ # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC)
244
+ features_extra = features_extra.reshape((features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1))
245
+
246
+ scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")]
247
+ scale_names = sorted(scale_names, key = lambda x: int(x.split('_')[-1]))
248
+ scales = np.zeros((xyz.shape[0], len(scale_names)))
249
+ for idx, attr_name in enumerate(scale_names):
250
+ scales[:, idx] = np.asarray(plydata.elements[0][attr_name])
251
+
252
+ rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")]
253
+ rot_names = sorted(rot_names, key = lambda x: int(x.split('_')[-1]))
254
+ rots = np.zeros((xyz.shape[0], len(rot_names)))
255
+ for idx, attr_name in enumerate(rot_names):
256
+ rots[:, idx] = np.asarray(plydata.elements[0][attr_name])
257
+
258
+ self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True))
259
+ self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True))
260
+ self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True))
261
+ self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True))
262
+ self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True))
263
+ self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True))
264
+
265
+ self.active_sh_degree = self.max_sh_degree
266
+
267
+ def replace_tensor_to_optimizer(self, tensor, name):
268
+ optimizable_tensors = {}
269
+ for group in self.optimizer.param_groups:
270
+ if group["name"] == name:
271
+ stored_state = self.optimizer.state.get(group['params'][0], None)
272
+ stored_state["exp_avg"] = torch.zeros_like(tensor)
273
+ stored_state["exp_avg_sq"] = torch.zeros_like(tensor)
274
+
275
+ del self.optimizer.state[group['params'][0]]
276
+ group["params"][0] = nn.Parameter(tensor.requires_grad_(True))
277
+ self.optimizer.state[group['params'][0]] = stored_state
278
+
279
+ optimizable_tensors[group["name"]] = group["params"][0]
280
+ return optimizable_tensors
281
+
282
+ def _prune_optimizer(self, mask):
283
+ optimizable_tensors = {}
284
+ for group in self.optimizer.param_groups:
285
+ stored_state = self.optimizer.state.get(group['params'][0], None)
286
+ if stored_state is not None:
287
+ stored_state["exp_avg"] = stored_state["exp_avg"][mask]
288
+ stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask]
289
+
290
+ del self.optimizer.state[group['params'][0]]
291
+ group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True)))
292
+ self.optimizer.state[group['params'][0]] = stored_state
293
+
294
+ optimizable_tensors[group["name"]] = group["params"][0]
295
+ else:
296
+ group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True))
297
+ optimizable_tensors[group["name"]] = group["params"][0]
298
+ return optimizable_tensors
299
+
300
+ def prune_points(self, mask):
301
+ valid_points_mask = ~mask
302
+ optimizable_tensors = self._prune_optimizer(valid_points_mask)
303
+
304
+ self._xyz = optimizable_tensors["xyz"]
305
+ self._features_dc = optimizable_tensors["f_dc"]
306
+ self._features_rest = optimizable_tensors["f_rest"]
307
+ self._opacity = optimizable_tensors["opacity"]
308
+ self._scaling = optimizable_tensors["scaling"]
309
+ self._rotation = optimizable_tensors["rotation"]
310
+
311
+ self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask]
312
+
313
+ self.denom = self.denom[valid_points_mask]
314
+ self.max_radii2D = self.max_radii2D[valid_points_mask]
315
+
316
+ def cat_tensors_to_optimizer(self, tensors_dict):
317
+ optimizable_tensors = {}
318
+ for group in self.optimizer.param_groups:
319
+ assert len(group["params"]) == 1
320
+ extension_tensor = tensors_dict[group["name"]]
321
+ stored_state = self.optimizer.state.get(group['params'][0], None)
322
+ if stored_state is not None:
323
+
324
+ stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0)
325
+ stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0)
326
+
327
+ del self.optimizer.state[group['params'][0]]
328
+ group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
329
+ self.optimizer.state[group['params'][0]] = stored_state
330
+
331
+ optimizable_tensors[group["name"]] = group["params"][0]
332
+ else:
333
+ group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
334
+ optimizable_tensors[group["name"]] = group["params"][0]
335
+
336
+ return optimizable_tensors
337
+
338
+ def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation):
339
+ d = {"xyz": new_xyz,
340
+ "f_dc": new_features_dc,
341
+ "f_rest": new_features_rest,
342
+ "opacity": new_opacities,
343
+ "scaling" : new_scaling,
344
+ "rotation" : new_rotation}
345
+
346
+ optimizable_tensors = self.cat_tensors_to_optimizer(d)
347
+ self._xyz = optimizable_tensors["xyz"]
348
+ self._features_dc = optimizable_tensors["f_dc"]
349
+ self._features_rest = optimizable_tensors["f_rest"]
350
+ self._opacity = optimizable_tensors["opacity"]
351
+ self._scaling = optimizable_tensors["scaling"]
352
+ self._rotation = optimizable_tensors["rotation"]
353
+
354
+ self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
355
+ self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
356
+ self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
357
+
358
+ def densify_and_split(self, grads, grad_threshold, scene_extent, N=2):
359
+ n_init_points = self.get_xyz.shape[0]
360
+ # Extract points that satisfy the gradient condition
361
+ padded_grad = torch.zeros((n_init_points), device="cuda")
362
+ padded_grad[:grads.shape[0]] = grads.squeeze()
363
+ selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False)
364
+ selected_pts_mask = torch.logical_and(selected_pts_mask,
365
+ torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent)
366
+
367
+ stds = self.get_scaling[selected_pts_mask].repeat(N,1)
368
+ means = torch.zeros((stds.size(0), 3), device="cuda")
369
+ samples = torch.normal(mean=means, std=stds)
370
+ rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1)
371
+ new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1)
372
+ new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N))
373
+ new_rotation = self._rotation[selected_pts_mask].repeat(N,1)
374
+ new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1)
375
+ new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1)
376
+ new_opacity = self._opacity[selected_pts_mask].repeat(N,1)
377
+
378
+ self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation)
379
+
380
+ prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool)))
381
+ self.prune_points(prune_filter)
382
+
383
+ def densify_and_clone(self, grads, grad_threshold, scene_extent):
384
+ # Extract points that satisfy the gradient condition
385
+ selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False)
386
+ selected_pts_mask = torch.logical_and(selected_pts_mask,
387
+ torch.max(self.get_scaling, dim=1).values <= self.percent_dense*scene_extent)
388
+
389
+ new_xyz = self._xyz[selected_pts_mask]
390
+ new_features_dc = self._features_dc[selected_pts_mask]
391
+ new_features_rest = self._features_rest[selected_pts_mask]
392
+ new_opacities = self._opacity[selected_pts_mask]
393
+ new_scaling = self._scaling[selected_pts_mask]
394
+ new_rotation = self._rotation[selected_pts_mask]
395
+
396
+ self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation)
397
+
398
+ def densify_and_prune(self, max_grad, min_opacity, extent, max_screen_size):
399
+ grads = self.xyz_gradient_accum / self.denom
400
+ grads[grads.isnan()] = 0.0
401
+
402
+ self.densify_and_clone(grads, max_grad, extent)
403
+ self.densify_and_split(grads, max_grad, extent)
404
+
405
+ prune_mask = (self.get_opacity < min_opacity).squeeze()
406
+ if max_screen_size:
407
+ big_points_vs = self.max_radii2D > max_screen_size
408
+ big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
409
+ prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws)
410
+ self.prune_points(prune_mask)
411
+
412
+ torch.cuda.empty_cache()
413
+
414
+ def add_densification_stats(self, viewspace_point_tensor, update_filter):
415
+ self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor.grad[update_filter,:2], dim=-1, keepdim=True)
416
+ self.denom[update_filter] += 1
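A sketch of the PLY round-trip the save_ply/load_ply pair supports (paths are illustrative; a CUDA device is assumed, since load_ply places its tensors on "cuda"):

    from utils.scene.gaussian_model import GaussianModel

    gm = GaussianModel(sh_degree=3)          # must match the SH degree in the PLY
    gm.load_ply("output/demo/point_cloud/iteration_7000/point_cloud.ply")
    print(gm.get_xyz.shape[0], "Gaussians")  # positions are (N, 3)
    gm.save_ply("output/demo/point_cloud/iteration_7000/point_cloud_copy.ply")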
utils/scene/utils/camera_utils.py ADDED
@@ -0,0 +1,84 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ from ..cameras import Camera
13
+ import numpy as np
14
+ from .general_utils import PILtoTorch
15
+ from .graphics_utils import fov2focal
16
+
17
+ WARNED = False
18
+
19
+ def loadCam(args, id, cam_info, resolution_scale):
20
+ orig_w, orig_h = cam_info.image.size
21
+
22
+ if args.resolution in [1, 2, 4, 8]:
23
+ resolution = round(orig_w/(resolution_scale * args.resolution)), round(orig_h/(resolution_scale * args.resolution))
24
+ else: # should be a type that converts to float
25
+ if args.resolution == -1:
26
+ if orig_w > 1600:
27
+ global WARNED
28
+ if not WARNED:
29
+ print("[ INFO ] Encountered quite large input images (>1.6K pixels width), rescaling to 1.6K.\n "
30
+ "If this is not desired, please explicitly specify '--resolution/-r' as 1")
31
+ WARNED = True
32
+ global_down = orig_w / 1600
33
+ else:
34
+ global_down = 1
35
+ else:
36
+ global_down = orig_w / args.resolution
37
+
38
+ scale = float(global_down) * float(resolution_scale)
39
+ resolution = (int(orig_w / scale), int(orig_h / scale))
40
+
41
+ resized_image_rgb = PILtoTorch(cam_info.image, resolution)
42
+
43
+ gt_image = resized_image_rgb[:3, ...]
44
+ loaded_mask = None
45
+
46
+ if resized_image_rgb.shape[1] == 4:
47
+ loaded_mask = resized_image_rgb[3:4, ...]
48
+ elif cam_info.mask is not None:
49
+ loaded_mask = ~(PILtoTorch(cam_info.mask, resolution)[0:1, ...] > 0)
50
+
51
+ return Camera(colmap_id=cam_info.uid, R=cam_info.R, T=cam_info.T,
52
+ FoVx=cam_info.FovX, FoVy=cam_info.FovY,
53
+ image=gt_image, gt_alpha_mask=loaded_mask,
54
+ image_name=cam_info.image_name, uid=id, data_device=args.data_device)
55
+
56
+ def cameraList_from_camInfos(cam_infos, resolution_scale, args):
57
+ camera_list = []
58
+
59
+ for id, c in enumerate(cam_infos):
60
+ camera_list.append(loadCam(args, id, c, resolution_scale))
61
+
62
+ return camera_list
63
+
64
+ def camera_to_JSON(id, camera):  # called with CameraInfo entries, which carry .width/.height
65
+ Rt = np.zeros((4, 4))
66
+ Rt[:3, :3] = camera.R.transpose()
67
+ Rt[:3, 3] = camera.T
68
+ Rt[3, 3] = 1.0
69
+
70
+ W2C = np.linalg.inv(Rt)
71
+ pos = W2C[:3, 3]
72
+ rot = W2C[:3, :3]
73
+ serializable_array_2d = [x.tolist() for x in rot]
74
+ camera_entry = {
75
+ 'id' : id,
76
+ 'img_name' : camera.image_name,
77
+ 'width' : camera.width,
78
+ 'height' : camera.height,
79
+ 'position': pos.tolist(),
80
+ 'rotation': serializable_array_2d,
81
+ 'fy' : fov2focal(camera.FovY, camera.height),
82
+ 'fx' : fov2focal(camera.FovX, camera.width)
83
+ }
84
+ return camera_entry
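To make loadCam's resolution branch concrete, a small stand-alone re-implementation of just the width computation (illustrative, not part of this commit):

    def target_width(orig_w, resolution, resolution_scale=1.0):
        # Mirrors the branch structure of loadCam above.
        if resolution in [1, 2, 4, 8]:
            return round(orig_w / (resolution_scale * resolution))
        if resolution == -1:
            global_down = orig_w / 1600 if orig_w > 1600 else 1
        else:
            global_down = orig_w / resolution
        return int(orig_w / (global_down * resolution_scale))

    assert target_width(3200, 2) == 1600
    assert target_width(3200, -1) == 1600   # wide inputs are auto-capped at 1.6K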
utils/scene/utils/general_utils.py ADDED
@@ -0,0 +1,133 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import sys
14
+ from datetime import datetime
15
+ import numpy as np
16
+ import random
17
+
18
+ def inverse_sigmoid(x):
19
+ return torch.log(x/(1-x))
20
+
21
+ def PILtoTorch(pil_image, resolution):
22
+ resized_image_PIL = pil_image.resize(resolution)
23
+ resized_image = torch.from_numpy(np.array(resized_image_PIL)) / 255.0
24
+ if len(resized_image.shape) == 3:
25
+ return resized_image.permute(2, 0, 1)
26
+ else:
27
+ return resized_image.unsqueeze(dim=-1).permute(2, 0, 1)
28
+
29
+ def get_expon_lr_func(
30
+ lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000
31
+ ):
32
+ """
33
+ Copied from Plenoxels
34
+
35
+ Continuous learning rate decay function. Adapted from JaxNeRF
36
+ The returned rate is lr_init when step=0 and lr_final when step=max_steps, and
37
+ is log-linearly interpolated elsewhere (equivalent to exponential decay).
38
+ If lr_delay_steps>0 then the learning rate will be scaled by some smooth
39
+ function of lr_delay_mult, such that the initial learning rate is
40
+ lr_init*lr_delay_mult at the beginning of optimization but will be eased back
41
+ to the normal learning rate when steps>lr_delay_steps.
42
+ :param conf: config subtree 'lr' or similar
43
+ :param max_steps: int, the number of steps during optimization.
44
+ :return HoF which takes step as input
45
+ """
46
+
47
+ def helper(step):
48
+ if step < 0 or (lr_init == 0.0 and lr_final == 0.0):
49
+ # Disable this parameter
50
+ return 0.0
51
+ if lr_delay_steps > 0:
52
+ # A kind of reverse cosine decay.
53
+ delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
54
+ 0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1)
55
+ )
56
+ else:
57
+ delay_rate = 1.0
58
+ t = np.clip(step / max_steps, 0, 1)
59
+ log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
60
+ return delay_rate * log_lerp
61
+
62
+ return helper
63
+
64
+ def strip_lowerdiag(L):
65
+ uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda")
66
+
67
+ uncertainty[:, 0] = L[:, 0, 0]
68
+ uncertainty[:, 1] = L[:, 0, 1]
69
+ uncertainty[:, 2] = L[:, 0, 2]
70
+ uncertainty[:, 3] = L[:, 1, 1]
71
+ uncertainty[:, 4] = L[:, 1, 2]
72
+ uncertainty[:, 5] = L[:, 2, 2]
73
+ return uncertainty
74
+
75
+ def strip_symmetric(sym):
76
+ return strip_lowerdiag(sym)
77
+
78
+ def build_rotation(r):
79
+ norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])
80
+
81
+ q = r / norm[:, None]
82
+
83
+ R = torch.zeros((q.size(0), 3, 3), device='cuda')
84
+
85
+ r = q[:, 0]
86
+ x = q[:, 1]
87
+ y = q[:, 2]
88
+ z = q[:, 3]
89
+
90
+ R[:, 0, 0] = 1 - 2 * (y*y + z*z)
91
+ R[:, 0, 1] = 2 * (x*y - r*z)
92
+ R[:, 0, 2] = 2 * (x*z + r*y)
93
+ R[:, 1, 0] = 2 * (x*y + r*z)
94
+ R[:, 1, 1] = 1 - 2 * (x*x + z*z)
95
+ R[:, 1, 2] = 2 * (y*z - r*x)
96
+ R[:, 2, 0] = 2 * (x*z - r*y)
97
+ R[:, 2, 1] = 2 * (y*z + r*x)
98
+ R[:, 2, 2] = 1 - 2 * (x*x + y*y)
99
+ return R
100
+
101
+ def build_scaling_rotation(s, r):
102
+ L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda")
103
+ R = build_rotation(r)
104
+
105
+ L[:,0,0] = s[:,0]
106
+ L[:,1,1] = s[:,1]
107
+ L[:,2,2] = s[:,2]
108
+
109
+ L = R @ L
110
+ return L
111
+
112
+ def safe_state(silent):
113
+ old_f = sys.stdout
114
+ class F:
115
+ def __init__(self, silent):
116
+ self.silent = silent
117
+
118
+ def write(self, x):
119
+ if not self.silent:
120
+ if x.endswith("\n"):
121
+ old_f.write(x.replace("\n", " [{}]\n".format(str(datetime.now().strftime("%d/%m %H:%M:%S")))))
122
+ else:
123
+ old_f.write(x)
124
+
125
+ def flush(self):
126
+ old_f.flush()
127
+
128
+ sys.stdout = F(silent)
129
+
130
+ random.seed(0)
131
+ np.random.seed(0)
132
+ torch.manual_seed(0)
133
+ torch.cuda.set_device(torch.device("cuda:0"))
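The schedule returned by get_expon_lr_func is log-linear, so the midpoint of training sits at the geometric mean of the endpoints. A quick check (the values below are illustrative):

    from utils.scene.utils.general_utils import get_expon_lr_func

    lr_fn = get_expon_lr_func(lr_init=1.6e-4, lr_final=1.6e-6, max_steps=30_000)
    print(lr_fn(0))        # 1.6e-4
    print(lr_fn(15_000))   # ~1.6e-5, the geometric mean of init and final
    print(lr_fn(30_000))   # 1.6e-6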
utils/scene/utils/graphics_utils.py ADDED
@@ -0,0 +1,88 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import math
14
+ import numpy as np
15
+ from typing import NamedTuple
16
+
17
+ class BasicPointCloud(NamedTuple):
18
+ points : np.array
19
+ colors : np.array
20
+ normals : np.array
21
+
22
+ def geom_transform_points(points, transf_matrix):
23
+ P, _ = points.shape
24
+ ones = torch.ones(P, 1, dtype=points.dtype, device=points.device)
25
+ points_hom = torch.cat([points, ones], dim=1)
26
+ points_out = torch.matmul(points_hom, transf_matrix.unsqueeze(0))
27
+
28
+ denom = points_out[..., 3:] + 0.0000001
29
+ return (points_out[..., :3] / denom).squeeze(dim=0)
30
+
31
+ def getWorld2View(R, t):
32
+ Rt = np.zeros((4, 4))
33
+ Rt[:3, :3] = R.transpose()
34
+ Rt[:3, 3] = t
35
+ Rt[3, 3] = 1.0
36
+ return np.float32(Rt)
37
+
38
+ def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0):
39
+ Rt = np.zeros((4, 4))
40
+ Rt[:3, :3] = R.transpose()
41
+ Rt[:3, 3] = t
42
+ Rt[3, 3] = 1.0
43
+
44
+ C2W = np.linalg.inv(Rt)
45
+ cam_center = C2W[:3, 3]
46
+ cam_center = (cam_center + translate) * scale
47
+ C2W[:3, 3] = cam_center
48
+ Rt = np.linalg.inv(C2W)
49
+ return np.float32(Rt)
50
+
51
+ def getProjectionMatrix(znear, zfar, fovX, fovY, crop_box=None, width=None, height=None):
52
+ tanHalfFovY = math.tan((fovY / 2))
53
+ tanHalfFovX = math.tan((fovX / 2))
54
+
55
+ top = tanHalfFovY * znear
56
+ bottom = -top
57
+ right = tanHalfFovX * znear
58
+ left = -right
59
+
60
+ frustum_width = right - left
61
+ frustum_height = top - bottom
62
+
63
+ if crop_box is not None:
64
+ assert width is not None and height is not None
65
+ x, y, w, h = crop_box
66
+ left = left + x / width * frustum_width
67
+ right = left + w / width * frustum_width
68
+ top = top - y / height * frustum_height
69
+ bottom = top - h / height * frustum_height
70
+
71
+ P = torch.zeros(4, 4)
72
+
73
+ z_sign = 1.0
74
+
75
+ P[0, 0] = 2.0 * znear / (right - left)
76
+ P[1, 1] = 2.0 * znear / (top - bottom)
77
+ P[0, 2] = (right + left) / (right - left)
78
+ P[1, 2] = (top + bottom) / (top - bottom)
79
+ P[3, 2] = z_sign
80
+ P[2, 2] = z_sign * zfar / (zfar - znear)
81
+ P[2, 3] = -(zfar * znear) / (zfar - znear)
82
+ return P
83
+
84
+ def fov2focal(fov, pixels):
85
+ return pixels / (2 * math.tan(fov / 2))
86
+
87
+ def focal2fov(focal, pixels):
88
+ return 2*math.atan(pixels/(2*focal))
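fov2focal and focal2fov are exact inverses for a fixed image size, which the projection code above relies on. A quick round-trip check:

    import math
    from utils.scene.utils.graphics_utils import fov2focal, focal2fov

    fov = math.radians(60)
    f = fov2focal(fov, 800)                 # ~692.8 px for an 800 px image
    assert abs(focal2fov(f, 800) - fov) < 1e-9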
utils/scene/utils/image_utils.py ADDED
@@ -0,0 +1,19 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+
14
+ def mse(img1, img2):
15
+ return ((img1 - img2) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
16
+
17
+ def psnr(img1, img2):
18
+ mse = ((img1 - img2) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
19
+ return 20 * torch.log10(1.0 / torch.sqrt(mse))
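psnr expects batched images in [0, 1] and returns one value per batch element; identical inputs give +inf. A sketch:

    import torch
    from utils.scene.utils.image_utils import psnr

    a = torch.rand(1, 3, 64, 64)
    b = (a + 0.01 * torch.randn_like(a)).clamp(0, 1)   # lightly perturbed copy
    print(psnr(a, b))                                  # shape (1, 1), in dB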
utils/scene/utils/loss_utils.py ADDED
@@ -0,0 +1,65 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch.autograd import Variable
15
+ from math import exp
16
+
17
+ def l1_loss(network_output, gt, reduce=True):
18
+ l1_loss = torch.abs((network_output - gt))
19
+ return l1_loss.mean() if reduce else l1_loss
20
+
21
+ def l2_loss(network_output, gt):
22
+ return ((network_output - gt) ** 2).mean()
23
+
24
+ def gaussian(window_size, sigma):
25
+ gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
26
+ return gauss / gauss.sum()
27
+
28
+ def create_window(window_size, channel):
29
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
30
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
31
+ window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
32
+ return window
33
+
34
+ def ssim(img1, img2, window_size=11, size_average=True):
35
+ channel = img1.size(-3)
36
+ window = create_window(window_size, channel)
37
+
38
+ if img1.is_cuda:
39
+ window = window.cuda(img1.get_device())
40
+ window = window.type_as(img1)
41
+
42
+ return _ssim(img1, img2, window, window_size, channel, size_average)
43
+
44
+ def _ssim(img1, img2, window, window_size, channel, size_average=True):
45
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
46
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
47
+
48
+ mu1_sq = mu1.pow(2)
49
+ mu2_sq = mu2.pow(2)
50
+ mu1_mu2 = mu1 * mu2
51
+
52
+ sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
53
+ sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
54
+ sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
55
+
56
+ C1 = 0.01 ** 2
57
+ C2 = 0.03 ** 2
58
+
59
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
60
+
61
+ if size_average:
62
+ return ssim_map.mean()
63
+ else:
64
+ return ssim_map.mean(1).mean(1).mean(1)
65
+
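These two terms are conventionally combined into the 3DGS photometric objective, a weighted sum of L1 and DSSIM; a sketch, where the 0.2 weight follows the original 3D Gaussian Splatting paper and is an assumption about how this repo configures it:

import torch

lambda_dssim = 0.2                     # 3DGS paper default; this repo may use a different value
rendered = torch.rand(1, 3, 128, 128)  # dummy render and ground truth
gt = torch.rand(1, 3, 128, 128)
loss = (1.0 - lambda_dssim) * l1_loss(rendered, gt) \
     + lambda_dssim * (1.0 - ssim(rendered, gt))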
utils/scene/utils/sh_utils.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright 2021 The PlenOctree Authors.
2
+ # Redistribution and use in source and binary forms, with or without
3
+ # modification, are permitted provided that the following conditions are met:
4
+ #
5
+ # 1. Redistributions of source code must retain the above copyright notice,
6
+ # this list of conditions and the following disclaimer.
7
+ #
8
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
9
+ # this list of conditions and the following disclaimer in the documentation
10
+ # and/or other materials provided with the distribution.
11
+ #
12
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
13
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
16
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
17
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
18
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
19
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
20
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22
+ # POSSIBILITY OF SUCH DAMAGE.
23
+
24
+ import torch
25
+
26
+ C0 = 0.28209479177387814
27
+ C1 = 0.4886025119029199
28
+ C2 = [
29
+ 1.0925484305920792,
30
+ -1.0925484305920792,
31
+ 0.31539156525252005,
32
+ -1.0925484305920792,
33
+ 0.5462742152960396
34
+ ]
35
+ C3 = [
36
+ -0.5900435899266435,
37
+ 2.890611442640554,
38
+ -0.4570457994644658,
39
+ 0.3731763325901154,
40
+ -0.4570457994644658,
41
+ 1.445305721320277,
42
+ -0.5900435899266435
43
+ ]
44
+ C4 = [
45
+ 2.5033429417967046,
46
+ -1.7701307697799304,
47
+ 0.9461746957575601,
48
+ -0.6690465435572892,
49
+ 0.10578554691520431,
50
+ -0.6690465435572892,
51
+ 0.47308734787878004,
52
+ -1.7701307697799304,
53
+ 0.6258357354491761,
54
+ ]
55
+
56
+
57
+ def eval_sh(deg, sh, dirs):
58
+ """
59
+ Evaluate spherical harmonics at unit directions
60
+ using hardcoded SH polynomials.
61
+ Works with torch/np/jnp.
62
+ ... Can be 0 or more batch dimensions.
63
+ Args:
64
+ deg: int SH deg. Currently, 0-3 supported
65
+ sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
66
+ dirs: jnp.ndarray unit directions [..., 3]
67
+ Returns:
68
+ [..., C]
69
+ """
70
+ assert deg <= 4 and deg >= 0
71
+ coeff = (deg + 1) ** 2
72
+ assert sh.shape[-1] >= coeff
73
+
74
+ result = C0 * sh[..., 0]
75
+ if deg > 0:
76
+ x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
77
+ result = (result -
78
+ C1 * y * sh[..., 1] +
79
+ C1 * z * sh[..., 2] -
80
+ C1 * x * sh[..., 3])
81
+
82
+ if deg > 1:
83
+ xx, yy, zz = x * x, y * y, z * z
84
+ xy, yz, xz = x * y, y * z, x * z
85
+ result = (result +
86
+ C2[0] * xy * sh[..., 4] +
87
+ C2[1] * yz * sh[..., 5] +
88
+ C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
89
+ C2[3] * xz * sh[..., 7] +
90
+ C2[4] * (xx - yy) * sh[..., 8])
91
+
92
+ if deg > 2:
93
+ result = (result +
94
+ C3[0] * y * (3 * xx - yy) * sh[..., 9] +
95
+ C3[1] * xy * z * sh[..., 10] +
96
+ C3[2] * y * (4 * zz - xx - yy) * sh[..., 11] +
97
+ C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
98
+ C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
99
+ C3[5] * z * (xx - yy) * sh[..., 14] +
100
+ C3[6] * x * (xx - 3 * yy) * sh[..., 15])
101
+
102
+ if deg > 3:
103
+ result = (result + C4[0] * xy * (xx - yy) * sh[..., 16] +
104
+ C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
105
+ C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
106
+ C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
107
+ C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
108
+ C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
109
+ C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
110
+ C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
111
+ C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
112
+ return result
113
+
114
+ def RGB2SH(rgb):
115
+ return (rgb - 0.5) / C0
116
+
117
+ def SH2RGB(sh):
118
+ return sh * C0 + 0.5
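A small round-trip check of the degree-0 path (per the docstring, sh is [..., C, (deg + 1) ** 2]); at degree 0 the result is view-independent, so dirs can be anything:

import torch

rgb = torch.rand(10, 3)                          # colors in [0, 1]
sh = RGB2SH(rgb).unsqueeze(-1)                   # [10, 3, 1]: one DC coefficient per channel
dirs = torch.nn.functional.normalize(torch.randn(10, 3), dim=-1)
color = eval_sh(0, sh, dirs) + 0.5               # eval_sh returns rgb - 0.5 at degree 0
assert torch.allclose(color, rgb, atol=1e-6)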
utils/scene/utils/system_utils.py ADDED
@@ -0,0 +1,28 @@
1
+ #
2
+ # Copyright (C) 2023, Inria
3
+ # GRAPHDECO research group, https://team.inria.fr/graphdeco
4
+ # All rights reserved.
5
+ #
6
+ # This software is free for non-commercial, research and evaluation use
7
+ # under the terms of the LICENSE.md file.
8
+ #
9
+ # For inquiries contact [email protected]
10
+ #
11
+
12
+ from errno import EEXIST
13
+ from os import makedirs, path
14
+ import os
15
+
16
+ def mkdir_p(folder_path):
17
+ # Creates a directory; equivalent to mkdir -p on the command line
18
+ try:
19
+ makedirs(folder_path)
20
+ except OSError as exc: # Python >2.5
21
+ if exc.errno == EEXIST and path.isdir(folder_path):
22
+ pass
23
+ else:
24
+ raise
25
+
26
+ def searchForMaxIteration(folder):
27
+ saved_iters = [int(fname.split("_")[-1]) for fname in os.listdir(folder)]
28
+ return max(saved_iters)
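searchForMaxIteration assumes every entry in the folder ends in _&lt;integer&gt; (the iteration_&lt;N&gt; checkpoint convention of 3DGS) and raises ValueError otherwise; a sketch:

import os
import tempfile

with tempfile.TemporaryDirectory() as folder:
    for it in (7000, 30000):
        os.mkdir(os.path.join(folder, f"iteration_{it}"))
    assert searchForMaxIteration(folder) == 30000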
zoedepth/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
zoedepth/data/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
zoedepth/data/data_mono.py ADDED
@@ -0,0 +1,697 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ # This file is partly inspired from BTS (https://github.com/cleinc/bts/blob/master/pytorch/bts_dataloader.py); author: Jin Han Lee
26
+
27
+ import itertools
28
+ import os
29
+ import random
30
+ from random import choice
31
+
32
+ import numpy as np
33
+ import cv2
34
+ import torch
35
+ import torch.nn as nn
36
+ import torch.utils.data.distributed
37
+ from zoedepth.utils.easydict import EasyDict as edict
38
+ from PIL import Image, ImageOps
39
+ from torch.utils.data import DataLoader, Dataset
40
+ from torchvision import transforms
41
+
42
+ from zoedepth.utils.config import change_dataset
43
+
44
+ from .ddad import get_ddad_loader
45
+ from .diml_indoor_test import get_diml_indoor_loader
46
+ from .diml_outdoor_test import get_diml_outdoor_loader
47
+ from .diode import get_diode_loader
48
+ from .hypersim import get_hypersim_loader
49
+ from .ibims import get_ibims_loader
50
+ from .sun_rgbd_loader import get_sunrgbd_loader
51
+ from .vkitti import get_vkitti_loader
52
+ from .vkitti2 import get_vkitti2_loader
53
+ from .places365 import get_places365_loader, Places365
54
+ from .marigold_nyu import get_marigold_nyu_loader, MarigoldNYU
55
+
56
+ from .preprocess import CropParams, get_white_border, get_black_border
57
+
58
+
59
+ def _is_pil_image(img):
60
+ return isinstance(img, Image.Image)
61
+
62
+
63
+ def _is_numpy_image(img):
64
+ return isinstance(img, np.ndarray) and (img.ndim in {2, 3})
65
+
66
+
67
+ def preprocessing_transforms(mode, **kwargs):
68
+ return transforms.Compose([
69
+ ToTensor(mode=mode, **kwargs)
70
+ ])
71
+
72
+
73
+ class DepthDataLoader(object):
74
+ def __init__(self, config, mode, device='cpu', transform=None, **kwargs):
75
+ """
76
+ Data loader for depth datasets
77
+
78
+ Args:
79
+ config (dict): Config dictionary. Refer to utils/config.py
80
+ mode (str): "train" or "online_eval"
81
+ device (str, optional): Device to load the data on. Defaults to 'cpu'.
82
+ transform (torchvision.transforms, optional): Transform to apply to the data. Defaults to None.
83
+ """
84
+
85
+ self.config = config
86
+
87
+ if config.dataset == 'ibims':
88
+ self.data = get_ibims_loader(config, batch_size=1, num_workers=1)
89
+ return
90
+
91
+ if config.dataset == 'sunrgbd':
92
+ self.data = get_sunrgbd_loader(
93
+ data_dir_root=config.sunrgbd_root, batch_size=1, num_workers=1)
94
+ return
95
+
96
+ if config.dataset == 'diml_indoor':
97
+ self.data = get_diml_indoor_loader(
98
+ data_dir_root=config.diml_indoor_root, batch_size=1, num_workers=1)
99
+ return
100
+
101
+ if config.dataset == 'diml_outdoor':
102
+ self.data = get_diml_outdoor_loader(
103
+ data_dir_root=config.diml_outdoor_root, batch_size=1, num_workers=1)
104
+ return
105
+
106
+ if "diode" in config.dataset:
107
+ self.data = get_diode_loader(
108
+ config[config.dataset+"_root"], batch_size=1, num_workers=1)
109
+ return
110
+
111
+ if config.dataset == 'hypersim_test':
112
+ self.data = get_hypersim_loader(
113
+ config.hypersim_test_root, batch_size=1, num_workers=1)
114
+ return
115
+
116
+ if config.dataset == 'vkitti':
117
+ self.data = get_vkitti_loader(
118
+ config.vkitti_root, batch_size=1, num_workers=1)
119
+ return
120
+
121
+ if config.dataset == 'vkitti2':
122
+ self.data = get_vkitti2_loader(
123
+ config.vkitti2_root, batch_size=1, num_workers=1)
124
+ return
125
+
126
+ if config.dataset == 'ddad':
127
+ self.data = get_ddad_loader(config.ddad_root, resize_shape=(
128
+ 352, 1216), batch_size=1, num_workers=1)
129
+ return
130
+
131
+ img_size = self.config.get("img_size", None)
132
+ img_size = img_size if self.config.get(
133
+ "do_input_resize", False) else None
134
+
135
+ if transform is None:
136
+ transform = preprocessing_transforms(mode, size=img_size)
137
+
138
+ if mode == 'train':
139
+
140
+ Dataset = DataLoadPreprocess
141
+ self.training_samples = Dataset(
142
+ config, mode, transform=transform, device=device)
143
+
144
+ if config.distributed and not config.debug_mode:
145
+ self.train_sampler = torch.utils.data.distributed.DistributedSampler(
146
+ self.training_samples)
147
+ else:
148
+ self.train_sampler = None
149
+
150
+ if not config.debug_mode:
151
+ self.data = DataLoader(self.training_samples,
152
+ batch_size=config.batch_size,
153
+ shuffle=(self.train_sampler is None),
154
+ num_workers=config.workers,
155
+ pin_memory=True,
156
+ persistent_workers=True,
157
+ # prefetch_factor=2,
158
+ sampler=self.train_sampler)
159
+ else:
160
+ self.data = DataLoader(self.training_samples,
161
+ batch_size=config.batch_size,
162
+ shuffle=(self.train_sampler is None),
163
+ num_workers=0,
164
+ pin_memory=True,
165
+ # prefetch_factor=2,
166
+ sampler=self.train_sampler)
167
+
168
+ elif mode == 'online_eval':
169
+ self.testing_samples = DataLoadPreprocess(
170
+ config, mode, transform=transform)
171
+ if config.distributed: # redundant. here only for readability and to be more explicit
172
+ # Give whole test set to all processes (and report evaluation only on one) regardless
173
+ self.eval_sampler = None
174
+ else:
175
+ self.eval_sampler = None
176
+ self.data = DataLoader(self.testing_samples, 1,
177
+ shuffle=kwargs.get("shuffle_test", False),
178
+ num_workers=1,
179
+ pin_memory=False,
180
+ sampler=self.eval_sampler)
181
+
182
+ elif mode == 'test':
183
+ self.testing_samples = DataLoadPreprocess(
184
+ config, mode, transform=transform)
185
+ self.data = DataLoader(self.testing_samples,
186
+ 1, shuffle=False, num_workers=1)
187
+
188
+ else:
189
+ print(
190
+ 'mode should be one of \'train, test, online_eval\'. Got {}'.format(mode))
191
+
192
+
193
+ def repetitive_roundrobin(*iterables):
194
+ """
195
+ cycles through iterables but sample wise
196
+ first yield first sample from first iterable then first sample from second iterable and so on
197
+ then second sample from first iterable then second sample from second iterable and so on
198
+
199
+ If one iterable is shorter than the others, it is repeated until all iterables are exhausted
200
+ repetitive_roundrobin('ABC', 'D', 'EF') --> A D E B D F C D E
201
+ """
202
+ # Repetitive roundrobin
203
+ iterables_ = [iter(it) for it in iterables]
204
+ exhausted = [False] * len(iterables)
205
+ while not all(exhausted):
206
+ for i, it in enumerate(iterables_):
207
+ try:
208
+ yield next(it)
209
+ except StopIteration:
210
+ exhausted[i] = True
211
+ iterables_[i] = itertools.cycle(iterables[i])
212
+ # First elements may get repeated if one iterable is shorter than the others
213
+ yield next(iterables_[i])
214
+
215
+
216
+ class RepetitiveRoundRobinDataLoader(object):
217
+ def __init__(self, *dataloaders):
218
+ self.dataloaders = dataloaders
219
+
220
+ def __iter__(self):
221
+ return repetitive_roundrobin(*self.dataloaders)
222
+
223
+ def __len__(self):
224
+ # First samples get repeated, thats why the plus one
225
+ return len(self.dataloaders) * (max(len(dl) for dl in self.dataloaders) + 1)
226
+
227
+
228
+ class MixedNYUKITTI(object):
229
+ def __init__(self, config, mode, device='cpu', **kwargs):
230
+ config = edict(config)
231
+ config.workers = config.workers // 2
232
+ self.config = config
233
+ nyu_conf = change_dataset(edict(config), 'nyu')
234
+ kitti_conf = change_dataset(edict(config), 'kitti')
235
+
236
+ # make nyu default for testing
237
+ self.config = config = nyu_conf
238
+ img_size = self.config.get("img_size", None)
239
+ img_size = img_size if self.config.get(
240
+ "do_input_resize", False) else None
241
+ if mode == 'train':
242
+ nyu_loader = DepthDataLoader(
243
+ nyu_conf, mode, device=device, transform=preprocessing_transforms(mode, size=img_size)).data
244
+ kitti_loader = DepthDataLoader(
245
+ kitti_conf, mode, device=device, transform=preprocessing_transforms(mode, size=img_size)).data
246
+ # It has been changed to repetitive roundrobin
247
+ self.data = RepetitiveRoundRobinDataLoader(
248
+ nyu_loader, kitti_loader)
249
+ else:
250
+ self.data = DepthDataLoader(nyu_conf, mode, device=device).data
251
+
252
+ class MixedNYUPlaces365(object):
253
+ def __init__(self, config, mode, device='cpu', **kwargs):
254
+ config = edict(config)
255
+ config.workers = config.workers // 2
256
+ self.config = config
257
+ nyu_conf = change_dataset(edict(config), 'nyu')
258
+ places365_conf = change_dataset(edict(config), 'places365')
259
+
260
+ # make nyu default for testing
261
+ self.config = config = nyu_conf
262
+ img_size = self.config.get("img_size", None)
263
+ img_size = img_size if self.config.get(
264
+ "do_input_resize", False) else None
265
+ if mode == 'train':
266
+ nyu_loader = DepthDataLoader(
267
+ nyu_conf, mode, device=device, transform=preprocessing_transforms(mode, size=img_size)).data
268
+ places365_loader = DepthDataLoader(
269
+ places365_conf, mode, device=device, transform=preprocessing_transforms(mode, size=img_size)).data
270
+ # It has been changed to repetitive roundrobin
271
+ self.data = RepetitiveRoundRobinDataLoader(
272
+ nyu_loader, places365_loader)
273
+ else:
274
+ self.data = DepthDataLoader(nyu_conf, mode, device=device).data
275
+
276
+ def remove_leading_slash(s):
277
+ if s[0] == '/' or s[0] == '\\':
278
+ return s[1:]
279
+ return s
280
+
281
+
282
+ class CachedReader:
283
+ def __init__(self, shared_dict=None):
284
+ if shared_dict:
285
+ self._cache = shared_dict
286
+ else:
287
+ self._cache = {}
288
+
289
+ def open(self, fpath):
290
+ im = self._cache.get(fpath, None)
291
+ if im is None:
292
+ im = self._cache[fpath] = Image.open(fpath)
293
+ return im
294
+
295
+
296
+ class ImReader:
297
+ def __init__(self):
298
+ pass
299
+
300
+ # @cache
301
+ def open(self, fpath):
302
+ return Image.open(fpath)
303
+
304
+
305
+ class DataLoadPreprocess(Dataset):
306
+ def __init__(self, config, mode, transform=None, is_for_online_eval=False, device="cpu", **kwargs):
307
+ self.config = config
308
+ if mode == 'online_eval':
309
+ with open(config.filenames_file_eval, 'r') as f:
310
+ self.filenames = f.readlines()
311
+ else:
312
+ with open(config.filenames_file, 'r') as f:
313
+ self.filenames = f.readlines()
314
+
315
+ self.device = torch.device(device)
316
+ self.mode = mode
317
+ self.transform = transform
318
+ self.to_tensor = ToTensor(mode)
319
+ self.is_for_online_eval = is_for_online_eval
320
+ if config.use_shared_dict:
321
+ self.reader = CachedReader(config.shared_dict)
322
+ else:
323
+ self.reader = ImReader()
324
+
325
+ if config.dataset == "places365" or config.inpaint_task_probability > 0:
326
+ places365_conf = change_dataset(edict(config), 'places365')
327
+ self.places365_data = self.data = Places365(places365_conf.places365_root, places365_conf.places365_depth_root, places365_conf.places365_depth_masks_root, randomize_masks=places365_conf.get("randomize_masks", True), debug_mode=self.config.debug_mode)
328
+
329
+ if config.dataset == "marigold_nyu":
330
+ self.marigold_data = self.data = MarigoldNYU(config.nyu_dir_root, config.marigold_depth_root, debug_mode=self.config.debug_mode)
331
+ self.config.avoid_boundary = True
332
+
333
+ def postprocess(self, sample):
334
+ return sample
335
+
336
+ def __getitem__(self, idx):
337
+ sample_path = self.filenames[idx] if self.config.dataset not in ('places365', "marigold_nyu") else self.filenames[0]
338
+ focal = float(sample_path.split()[2])
339
+ sample = {}
340
+
341
+ if self.mode == 'train':
342
+ depth_mask = None
343
+ if self.config.dataset == 'kitti' and self.config.use_right and random.random() > 0.5:
344
+ image_path = os.path.join(
345
+ self.config.data_path, remove_leading_slash(sample_path.split()[3]))
346
+ depth_path = os.path.join(
347
+ self.config.gt_path, remove_leading_slash(sample_path.split()[4]))
348
+
349
+ image = self.reader.open(image_path)
350
+ depth_gt = self.reader.open(depth_path)
351
+ w, h = image.size
352
+
353
+ elif self.config.dataset == 'places365':
354
+ image, depth_gt, depth_mask, image_path, depth_path, _ = self.places365_data[idx]
355
+ h, w = image.shape[:2]
356
+
357
+ if image.ndim == 2:
358
+ image = image.reshape(image.shape[0], image.shape[1], 1)
359
+ image = np.repeat(image, 3, axis=-1)
360
+
361
+ elif self.config.dataset == 'marigold_nyu':
362
+ image, depth_gt, marigold_gt, image_path, depth_path = self.marigold_data[idx]
363
+
364
+ h, w = image.shape[:2]
365
+
366
+ if image.ndim == 2:
367
+ image = image.reshape(image.shape[0], image.shape[1], 1)
368
+ image = np.repeat(image, 3, axis=-1)
369
+
370
+ else:
371
+ image_path = os.path.join(
372
+ self.config.data_path, remove_leading_slash(sample_path.split()[0]))
373
+ depth_path = os.path.join(
374
+ self.config.gt_path, remove_leading_slash(sample_path.split()[1]))
375
+
376
+ image = self.reader.open(image_path)
377
+ depth_gt = self.reader.open(depth_path)
378
+ w, h = image.size
379
+
380
+ if self.config.inpaint_task_probability > 0:
381
+ _, _, depth_mask, _, _, _ = self.places365_data[idx]
382
+
383
+ if self.config.do_kb_crop:
384
+ height = image.height
385
+ width = image.width
386
+ top_margin = int(height - 352)
387
+ left_margin = int((width - 1216) / 2)
388
+ depth_gt = depth_gt.crop(
389
+ (left_margin, top_margin, left_margin + 1216, top_margin + 352))
390
+ image = image.crop(
391
+ (left_margin, top_margin, left_margin + 1216, top_margin + 352))
392
+
393
+ # Avoid blank boundaries due to pixel registration?
394
+ # Train images have white border. Test images have black border.
395
+ if self.config.dataset in ('nyu', 'marigold_nyu') and self.config.avoid_boundary:
396
+ # print("Avoiding Blank Boundaries!")
397
+ # We just crop and pad again with reflect padding to original size
398
+ # original_size = image.size
399
+ #crop_params = get_white_border(np.array(255*image, dtype=np.uint8))
400
+ # crop image down from 640x480 to 624x464
401
+ crop_params = CropParams(8, 472, 8, 632)
402
+
403
+ image = image[crop_params.top:crop_params.bottom, crop_params.left:crop_params.right]
404
+ depth_gt = depth_gt[crop_params.top:crop_params.bottom, crop_params.left:crop_params.right]
405
+
406
+ # Use reflect padding to fill the blank
407
+ #image = np.pad(image, ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right), (0, 0)), mode='reflect')
408
+ #image = Image.fromarray(image)
409
+
410
+ #depth_gt = np.pad(depth_gt, ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right), (0, 0)), 'constant', constant_values=0)
411
+ #depth_gt = Image.fromarray(depth_gt)
412
+
413
+ if self.config.dataset == "marigold_nyu":
414
+ marigold_gt = marigold_gt[crop_params.top:crop_params.bottom, crop_params.left:crop_params.right]
415
+
416
+ if self.config.do_random_rotate and (self.config.aug) and self.config.dataset not in ('places365', "marigold_nyu"):
417
+ random_angle = (random.random() - 0.5) * 2 * self.config.degree
418
+ image = self.rotate_image(image, random_angle)
419
+ depth_gt = self.rotate_image(
420
+ depth_gt, random_angle, flag=Image.NEAREST)
421
+
422
+ if self.config.dataset not in ('places365', "marigold_nyu"):
423
+ image = np.asarray(image, dtype=np.float32) / 255.0
424
+ depth_gt = np.asarray(depth_gt, dtype=np.float32)
425
+ depth_gt = np.expand_dims(depth_gt, axis=2)
426
+
427
+ if self.config.dataset in ('nyu', 'marigold_nyu'):
428
+ depth_gt = depth_gt / 1000.0
429
+ elif self.config.dataset != 'places365':
430
+ depth_gt = depth_gt / 256.0
431
+
432
+ if self.config.aug and (self.config.random_crop) and self.config.dataset not in ('places365', "marigold_nyu"):
433
+ image, depth_gt = self.random_crop(
434
+ image, depth_gt, self.config.input_height, self.config.input_width)
435
+
436
+ if self.config.aug and self.config.random_translate and self.config.dataset not in ('places365', "marigold_nyu"):
437
+ # print("Random Translation!")
438
+ image, depth_gt = self.random_translate(image, depth_gt, self.config.max_translation)
439
+
440
+ mask = np.logical_and(depth_gt > self.config.min_depth,
441
+ depth_gt < self.config.max_depth).squeeze()[None, ...]
442
+
443
+ is_inpainting_sample = self.config.inpaint_task_probability > 0 and (torch.rand(1).item() < self.config.inpaint_task_probability)
444
+
445
+ def randomly_scale_depth(depth_to_scale):
446
+ # scale the mask
447
+ max_scale_factor = self.config.max_depth / depth_to_scale.max()
448
+ min_scale_factor = self.config.min_depth / depth_to_scale.min()
449
+
450
+ scale_factor = torch.rand(1).item() * (max_scale_factor - min_scale_factor) + min_scale_factor
451
+ scaled_depth = depth_to_scale * scale_factor
452
+
453
+ scaled_depth = scaled_depth.clip(self.config.min_depth, self.config.max_depth)
454
+
455
+ return scaled_depth
456
+
457
+ if self.config.dataset in ("marigold_nyu"):
458
+ marigold_mask = (marigold_gt > -1).squeeze()[None, ...]
459
+
460
+ if is_inpainting_sample and self.config.random_inpainting_scaling:
461
+ marigold_gt = randomly_scale_depth(marigold_gt)
462
+
463
+ marigold_gt[~marigold_mask[0]] = 0
464
+
465
+ depth_gt = marigold_gt
466
+ mask = marigold_mask
467
+
468
+ image, depth_gt, mask = self.train_preprocess(image, depth_gt, mask)
469
+
470
+ sample = {'image': image, 'depth': depth_gt, 'focal': focal,
471
+ 'mask': mask, **sample}
472
+
473
+ if self.config["depth_channel_mask_augment"]:
474
+ if self.config.dataset in ("marigold_nyu",):
475
+ if (not self.config.inpaint_task_probability > 0) and depth_mask is None:
476
+ depth_mask = np.zeros_like(depth_gt)
477
+ elif self.config.inpaint_task_probability > 0:
478
+ # we randomly mask with places365, or provide no sparse input at all
479
+ if is_inpainting_sample:
480
+ # upsample depth_mask to match depth_gt
481
+ depth_mask = torch.nn.functional.interpolate(torch.from_numpy(depth_mask).permute(2, 0, 1).unsqueeze(0), size=depth_gt.shape[:2], mode='nearest').squeeze(0).permute(1, 2, 0).numpy()
482
+ else:
483
+ depth_mask = np.zeros_like(depth_gt)
484
+
485
+ sample["masked_depth"] = depth_gt * depth_mask
486
+
487
+ else:
488
+ if self.mode == 'online_eval':
489
+ data_path = self.config.data_path_eval
490
+ else:
491
+ data_path = self.config.data_path
492
+
493
+ image_path = os.path.join(
494
+ data_path, remove_leading_slash(sample_path.split()[0]))
495
+ image = np.asarray(self.reader.open(image_path),
496
+ dtype=np.float32) / 255.0
497
+
498
+ if self.mode == 'online_eval':
499
+ gt_path = self.config.gt_path_eval
500
+ depth_path = os.path.join(
501
+ gt_path, remove_leading_slash(sample_path.split()[1]))
502
+ has_valid_depth = False
503
+ try:
504
+ depth_gt = self.reader.open(depth_path)
505
+ has_valid_depth = True
506
+ except IOError:
507
+ depth_gt = False
508
+ # print('Missing gt for {}'.format(image_path))
509
+
510
+ if has_valid_depth:
511
+ depth_gt = np.asarray(depth_gt, dtype=np.float32)
512
+ depth_gt = np.expand_dims(depth_gt, axis=2)
513
+ if self.config.dataset == 'nyu':
514
+ depth_gt = depth_gt / 1000.0
515
+ elif self.config.dataset != 'places365':
516
+ depth_gt = depth_gt / 256.0
517
+
518
+ mask = np.logical_and(
519
+ depth_gt >= self.config.min_depth, depth_gt <= self.config.max_depth).squeeze()[None, ...]
520
+ else:
521
+ mask = False
522
+
523
+ if self.config.do_kb_crop:
524
+ height = image.shape[0]
525
+ width = image.shape[1]
526
+ top_margin = int(height - 352)
527
+ left_margin = int((width - 1216) / 2)
528
+ image = image[top_margin:top_margin + 352,
529
+ left_margin:left_margin + 1216, :]
530
+ if self.mode == 'online_eval' and has_valid_depth:
531
+ depth_gt = depth_gt[top_margin:top_margin +
532
+ 352, left_margin:left_margin + 1216, :]
533
+
534
+ if self.mode == 'online_eval':
535
+ sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': has_valid_depth,
536
+ 'image_path': sample_path.split()[0], 'depth_path': sample_path.split()[1],
537
+ 'mask': mask}
538
+ else:
539
+ sample = {'image': image, 'focal': focal}
540
+
541
+ if (self.mode == 'train') or ('has_valid_depth' in sample and sample['has_valid_depth']):
542
+ if (self.config.dataset not in ('places365', "marigold_nyu")):
543
+ mask = np.logical_and(depth_gt > self.config.min_depth,
544
+ depth_gt < self.config.max_depth).squeeze()[None, ...]
545
+ sample['mask'] = mask
546
+
547
+ if self.transform:
548
+ sample = self.transform(sample)
549
+
550
+ sample = self.postprocess(sample)
551
+ sample['dataset'] = self.config.dataset
552
+
553
+ if self.config.dataset != 'places365':
554
+ sample = {**sample, 'image_path': sample_path.split()[0], 'depth_path': sample_path.split()[1]}
555
+ else:
556
+ sample = {**sample, 'image_path': image_path, 'depth_path': depth_path}
557
+
558
+ return sample
559
+
560
+ def rotate_image(self, image, angle, flag=Image.BILINEAR):
561
+ result = image.rotate(angle, resample=flag)
562
+ return result
563
+
564
+ def random_crop(self, img, depth, height, width):
565
+ assert img.shape[0] >= height
566
+ assert img.shape[1] >= width
567
+ assert img.shape[0] == depth.shape[0]
568
+ assert img.shape[1] == depth.shape[1]
569
+ x = random.randint(0, img.shape[1] - width)
570
+ y = random.randint(0, img.shape[0] - height)
571
+ img = img[y:y + height, x:x + width, :]
572
+ depth = depth[y:y + height, x:x + width, :]
573
+
574
+ return img, depth
575
+
576
+ def random_translate(self, img, depth, max_t=20):
577
+ assert img.shape[0] == depth.shape[0]
578
+ assert img.shape[1] == depth.shape[1]
579
+ p = self.config.translate_prob
580
+ do_translate = random.random()
581
+ if do_translate > p:
582
+ return img, depth
583
+ x = random.randint(-max_t, max_t)
584
+ y = random.randint(-max_t, max_t)
585
+ M = np.float32([[1, 0, x], [0, 1, y]])
586
+ # print(img.shape, depth.shape)
587
+ img = cv2.warpAffine(img, M, (img.shape[1], img.shape[0]))
588
+ depth = cv2.warpAffine(depth, M, (depth.shape[1], depth.shape[0]))
589
+ depth = depth.squeeze()[..., None] # add channel dim back. Affine warp removes it
590
+ # print("after", img.shape, depth.shape)
591
+ return img, depth
592
+
593
+ def train_preprocess(self, image, depth_gt, mask):
594
+ if self.config.aug:
595
+ # Random flipping
596
+ do_flip = random.random()
597
+ if do_flip > 0.5:
598
+ # image is H x W x 3
599
+ image = (image[:, ::-1, :]).copy()
600
+ # depth_gt is H x W x 1
601
+ depth_gt = (depth_gt[:, ::-1, :]).copy()
602
+ # mask is B x H x W
603
+ mask = (mask[:, :, ::-1]).copy()
604
+
605
+ # Random gamma, brightness, color augmentation
606
+ do_augment = random.random()
607
+ if do_augment > 0.5:
608
+ image = self.augment_image(image)
609
+
610
+ return image, depth_gt, mask
611
+
612
+ def augment_image(self, image):
613
+ # gamma augmentation
614
+ gamma = random.uniform(0.9, 1.1)
615
+ image_aug = image ** gamma
616
+
617
+ # brightness augmentation
618
+ if self.config.dataset == 'nyu':
619
+ brightness = random.uniform(0.75, 1.25)
620
+ else:
621
+ brightness = random.uniform(0.9, 1.1)
622
+ image_aug = image_aug * brightness
623
+
624
+ # color augmentation
625
+ colors = np.random.uniform(0.9, 1.1, size=3)
626
+ white = np.ones((image.shape[0], image.shape[1]))
627
+ color_image = np.stack([white * colors[i] for i in range(3)], axis=2)
628
+ image_aug *= color_image
629
+ image_aug = np.clip(image_aug, 0, 1)
630
+
631
+ return image_aug
632
+
633
+ def __len__(self):
634
+ return len(self.data) if (self.config.dataset in ('places365', "marigold_nyu") and self.mode != 'online_eval') else len(self.filenames)
635
+
636
+
637
+ class ToTensor(object):
638
+ def __init__(self, mode, do_normalize=False, size=None):
639
+ self.mode = mode
640
+ self.normalize = transforms.Normalize(
641
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if do_normalize else nn.Identity()
642
+ self.size = size
643
+ if size is not None:
644
+ self.resize = transforms.Resize(size=size)
645
+ else:
646
+ self.resize = nn.Identity()
647
+
648
+ def __call__(self, sample):
649
+ image, focal = sample['image'], sample['focal']
650
+ image = self.to_tensor(image)
651
+ image = self.normalize(image)
652
+ image = self.resize(image)
653
+
654
+ if self.mode == 'test':
655
+ return {'image': image, 'focal': focal}
656
+
657
+ depth = sample['depth']
658
+ if self.mode == 'train':
659
+ depth = self.to_tensor(depth)
660
+ return {**sample, 'image': image, 'depth': depth, 'focal': focal}
661
+ else:
662
+ has_valid_depth = sample['has_valid_depth']
663
+ image = self.resize(image)
664
+ return {**sample, 'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth,
665
+ 'image_path': sample['image_path'], 'depth_path': sample['depth_path']}
666
+
667
+ def to_tensor(self, pic):
668
+ if not (_is_pil_image(pic) or _is_numpy_image(pic)):
669
+ raise TypeError(
670
+ 'pic should be PIL Image or ndarray. Got {}'.format(type(pic)))
671
+
672
+ if isinstance(pic, np.ndarray):
673
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
674
+ return img
675
+
676
+ # handle PIL Image
677
+ if pic.mode == 'I':
678
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
679
+ elif pic.mode == 'I;16':
680
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
681
+ else:
682
+ img = torch.ByteTensor(
683
+ torch.ByteStorage.from_buffer(pic.tobytes()))
684
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
685
+ if pic.mode == 'YCbCr':
686
+ nchannel = 3
687
+ elif pic.mode == 'I;16':
688
+ nchannel = 1
689
+ else:
690
+ nchannel = len(pic.mode)
691
+ img = img.view(pic.size[1], pic.size[0], nchannel)
692
+
693
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
694
+ if isinstance(img, torch.ByteTensor):
695
+ return img.float()
696
+ else:
697
+ return img
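One subtlety in the mixing logic above: repetitive_roundrobin only detects exhaustion when a next() call fails, at which point it refills the stream and yields from it again, so every run ends with one extra round of repeats; RepetitiveRoundRobinDataLoader.__len__ accounts for exactly that with its + 1. A quick trace:

out = list(repetitive_roundrobin('ABC', 'D', 'EF'))
print(out)       # ['A','D','E','B','D','F','C','D','E','A','D','F']
print(len(out))  # 12 == number of iterables * (max length + 1) == 3 * 4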
zoedepth/data/ddad.py ADDED
@@ -0,0 +1,117 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms
32
+
33
+
34
+ class ToTensor(object):
35
+ def __init__(self, resize_shape):
36
+ # self.normalize = transforms.Normalize(
37
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
38
+ self.normalize = lambda x : x
39
+ self.resize = transforms.Resize(resize_shape)
40
+
41
+ def __call__(self, sample):
42
+ image, depth = sample['image'], sample['depth']
43
+ image = self.to_tensor(image)
44
+ image = self.normalize(image)
45
+ depth = self.to_tensor(depth)
46
+
47
+ image = self.resize(image)
48
+
49
+ return {'image': image, 'depth': depth, 'dataset': "ddad"}
50
+
51
+ def to_tensor(self, pic):
52
+
53
+ if isinstance(pic, np.ndarray):
54
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
55
+ return img
56
+
57
+ # # handle PIL Image
58
+ if pic.mode == 'I':
59
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
60
+ elif pic.mode == 'I;16':
61
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
62
+ else:
63
+ img = torch.ByteTensor(
64
+ torch.ByteStorage.from_buffer(pic.tobytes()))
65
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
66
+ if pic.mode == 'YCbCr':
67
+ nchannel = 3
68
+ elif pic.mode == 'I;16':
69
+ nchannel = 1
70
+ else:
71
+ nchannel = len(pic.mode)
72
+ img = img.view(pic.size[1], pic.size[0], nchannel)
73
+
74
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
75
+
76
+ if isinstance(img, torch.ByteTensor):
77
+ return img.float()
78
+ else:
79
+ return img
80
+
81
+
82
+ class DDAD(Dataset):
83
+ def __init__(self, data_dir_root, resize_shape):
84
+ import glob
85
+
86
+ # image paths are of the form <data_dir_root>/{outleft, depthmap}/*.png
87
+ self.image_files = glob.glob(os.path.join(data_dir_root, '*.png'))
88
+ self.depth_files = [r.replace("_rgb.png", "_depth.npy")
89
+ for r in self.image_files]
90
+ self.transform = ToTensor(resize_shape)
91
+
92
+ def __getitem__(self, idx):
93
+
94
+ image_path = self.image_files[idx]
95
+ depth_path = self.depth_files[idx]
96
+
97
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
98
+ depth = np.load(depth_path) # meters
99
+
100
+ # depth[depth > 8] = -1
101
+ depth = depth[..., None]
102
+
103
+ sample = dict(image=image, depth=depth)
104
+ sample = self.transform(sample)
105
+
106
+ if idx == 0:
107
+ print(sample["image"].shape)
108
+
109
+ return sample
110
+
111
+ def __len__(self):
112
+ return len(self.image_files)
113
+
114
+
115
+ def get_ddad_loader(data_dir_root, resize_shape, batch_size=1, **kwargs):
116
+ dataset = DDAD(data_dir_root, resize_shape)
117
+ return DataLoader(dataset, batch_size, **kwargs)
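Usage mirrors the other evaluation loaders in this package; the dataset path below is a placeholder:

loader = get_ddad_loader("datasets/ddad", resize_shape=(352, 1216))
batch = next(iter(loader))
print(batch["image"].shape)  # [1, 3, 352, 1216] after the resize
print(batch["depth"].shape)  # [1, 1, H, W]; depth keeps its native resolution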
zoedepth/data/diml_indoor_test.py ADDED
@@ -0,0 +1,125 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms
32
+
33
+
34
+ class ToTensor(object):
35
+ def __init__(self):
36
+ # self.normalize = transforms.Normalize(
37
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
38
+ self.normalize = lambda x : x
39
+ self.resize = transforms.Resize((480, 640))
40
+
41
+ def __call__(self, sample):
42
+ image, depth = sample['image'], sample['depth']
43
+ image = self.to_tensor(image)
44
+ image = self.normalize(image)
45
+ depth = self.to_tensor(depth)
46
+
47
+ image = self.resize(image)
48
+
49
+ return {'image': image, 'depth': depth, 'dataset': "diml_indoor"}
50
+
51
+ def to_tensor(self, pic):
52
+
53
+ if isinstance(pic, np.ndarray):
54
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
55
+ return img
56
+
57
+ # # handle PIL Image
58
+ if pic.mode == 'I':
59
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
60
+ elif pic.mode == 'I;16':
61
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
62
+ else:
63
+ img = torch.ByteTensor(
64
+ torch.ByteStorage.from_buffer(pic.tobytes()))
65
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
66
+ if pic.mode == 'YCbCr':
67
+ nchannel = 3
68
+ elif pic.mode == 'I;16':
69
+ nchannel = 1
70
+ else:
71
+ nchannel = len(pic.mode)
72
+ img = img.view(pic.size[1], pic.size[0], nchannel)
73
+
74
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
75
+ if isinstance(img, torch.ByteTensor):
76
+ return img.float()
77
+ else:
78
+ return img
79
+
80
+
81
+ class DIML_Indoor(Dataset):
82
+ def __init__(self, data_dir_root):
83
+ import glob
84
+
85
+ # image paths are of the form <data_dir_root>/{HR, LR}/<scene>/{color, depth_filled}/*.png
86
+ self.image_files = glob.glob(os.path.join(
87
+ data_dir_root, "LR", '*', 'color', '*.png'))
88
+ self.depth_files = [r.replace("color", "depth_filled").replace(
89
+ "_c.png", "_depth_filled.png") for r in self.image_files]
90
+ self.transform = ToTensor()
91
+
92
+ def __getitem__(self, idx):
93
+ image_path = self.image_files[idx]
94
+ depth_path = self.depth_files[idx]
95
+
96
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
97
+ depth = np.asarray(Image.open(depth_path),
98
+ dtype='uint16') / 1000.0 # mm to meters
99
+
100
+ # print(np.shape(image))
101
+ # print(np.shape(depth))
102
+
103
+ # depth[depth > 8] = -1
104
+ depth = depth[..., None]
105
+
106
+ sample = dict(image=image, depth=depth)
107
+
108
+ # return sample
109
+ sample = self.transform(sample)
110
+
111
+ if idx == 0:
112
+ print(sample["image"].shape)
113
+
114
+ return sample
115
+
116
+ def __len__(self):
117
+ return len(self.image_files)
118
+
119
+
120
+ def get_diml_indoor_loader(data_dir_root, batch_size=1, **kwargs):
121
+ dataset = DIML_Indoor(data_dir_root)
122
+ return DataLoader(dataset, batch_size, **kwargs)
123
+
124
+ # get_diml_indoor_loader(data_dir_root="datasets/diml/indoor/test/HR")
125
+ # get_diml_indoor_loader(data_dir_root="datasets/diml/indoor/test/LR")
zoedepth/data/diml_outdoor_test.py ADDED
@@ -0,0 +1,114 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms
32
+
33
+
34
+ class ToTensor(object):
35
+ def __init__(self):
36
+ # self.normalize = transforms.Normalize(
37
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
38
+ self.normalize = lambda x : x
39
+
40
+ def __call__(self, sample):
41
+ image, depth = sample['image'], sample['depth']
42
+ image = self.to_tensor(image)
43
+ image = self.normalize(image)
44
+ depth = self.to_tensor(depth)
45
+
46
+ return {'image': image, 'depth': depth, 'dataset': "diml_outdoor"}
47
+
48
+ def to_tensor(self, pic):
49
+
50
+ if isinstance(pic, np.ndarray):
51
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
52
+ return img
53
+
54
+ # # handle PIL Image
55
+ if pic.mode == 'I':
56
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
57
+ elif pic.mode == 'I;16':
58
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
59
+ else:
60
+ img = torch.ByteTensor(
61
+ torch.ByteStorage.from_buffer(pic.tobytes()))
62
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
63
+ if pic.mode == 'YCbCr':
64
+ nchannel = 3
65
+ elif pic.mode == 'I;16':
66
+ nchannel = 1
67
+ else:
68
+ nchannel = len(pic.mode)
69
+ img = img.view(pic.size[1], pic.size[0], nchannel)
70
+
71
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
72
+ if isinstance(img, torch.ByteTensor):
73
+ return img.float()
74
+ else:
75
+ return img
76
+
77
+
78
+ class DIML_Outdoor(Dataset):
79
+ def __init__(self, data_dir_root):
80
+ import glob
81
+
82
+ # image paths are of the form <data_dir_root>/{outleft, depthmap}/*.png
83
+ self.image_files = glob.glob(os.path.join(
84
+ data_dir_root, "*", 'outleft', '*.png'))
85
+ self.depth_files = [r.replace("outleft", "depthmap")
86
+ for r in self.image_files]
87
+ self.transform = ToTensor()
88
+
89
+ def __getitem__(self, idx):
90
+ image_path = self.image_files[idx]
91
+ depth_path = self.depth_files[idx]
92
+
93
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
94
+ depth = np.asarray(Image.open(depth_path),
95
+ dtype='uint16') / 1000.0 # mm to meters
96
+
97
+ # depth[depth > 8] = -1
98
+ depth = depth[..., None]
99
+
100
+ sample = dict(image=image, depth=depth, dataset="diml_outdoor")
101
+
102
+ # return sample
103
+ return self.transform(sample)
104
+
105
+ def __len__(self):
106
+ return len(self.image_files)
107
+
108
+
109
+ def get_diml_outdoor_loader(data_dir_root, batch_size=1, **kwargs):
110
+ dataset = DIML_Outdoor(data_dir_root)
111
+ return DataLoader(dataset, batch_size, **kwargs)
112
+
113
+ # get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/HR")
114
+ # get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/LR")
zoedepth/data/diode.py ADDED
@@ -0,0 +1,125 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms
32
+
33
+
34
+ class ToTensor(object):
35
+ def __init__(self):
36
+ # self.normalize = transforms.Normalize(
37
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
38
+ self.normalize = lambda x : x
39
+ self.resize = transforms.Resize(480)
40
+
41
+ def __call__(self, sample):
42
+ image, depth = sample['image'], sample['depth']
43
+ image = self.to_tensor(image)
44
+ image = self.normalize(image)
45
+ depth = self.to_tensor(depth)
46
+
47
+ image = self.resize(image)
48
+
49
+ return {'image': image, 'depth': depth, 'dataset': "diode"}
50
+
51
+ def to_tensor(self, pic):
52
+
53
+ if isinstance(pic, np.ndarray):
54
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
55
+ return img
56
+
57
+ # # handle PIL Image
58
+ if pic.mode == 'I':
59
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
60
+ elif pic.mode == 'I;16':
61
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
62
+ else:
63
+ img = torch.ByteTensor(
64
+ torch.ByteStorage.from_buffer(pic.tobytes()))
65
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
66
+ if pic.mode == 'YCbCr':
67
+ nchannel = 3
68
+ elif pic.mode == 'I;16':
69
+ nchannel = 1
70
+ else:
71
+ nchannel = len(pic.mode)
72
+ img = img.view(pic.size[1], pic.size[0], nchannel)
73
+
74
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
75
+
76
+ if isinstance(img, torch.ByteTensor):
77
+ return img.float()
78
+ else:
79
+ return img
80
+
81
+
82
+ class DIODE(Dataset):
83
+ def __init__(self, data_dir_root):
84
+ import glob
85
+
86
+ # image paths are of the form <data_dir_root>/scene_#/scan_#/*.png
87
+ self.image_files = glob.glob(
88
+ os.path.join(data_dir_root, '*', '*', '*.png'))
89
+ self.depth_files = [r.replace(".png", "_depth.npy")
90
+ for r in self.image_files]
91
+ self.depth_mask_files = [
92
+ r.replace(".png", "_depth_mask.npy") for r in self.image_files]
93
+ self.transform = ToTensor()
94
+
95
+ def __getitem__(self, idx):
96
+ image_path = self.image_files[idx]
97
+ depth_path = self.depth_files[idx]
98
+ depth_mask_path = self.depth_mask_files[idx]
99
+
100
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
101
+ depth = np.load(depth_path) # in meters
102
+ valid = np.load(depth_mask_path) # binary
103
+
104
+ # depth[depth > 8] = -1
105
+ # depth = depth[..., None]
106
+
107
+ sample = dict(image=image, depth=depth, valid=valid)
108
+
109
+ # return sample
110
+ sample = self.transform(sample)
111
+
112
+ if idx == 0:
113
+ print(sample["image"].shape)
114
+
115
+ return sample
116
+
117
+ def __len__(self):
118
+ return len(self.image_files)
119
+
120
+
121
+ def get_diode_loader(data_dir_root, batch_size=1, **kwargs):
122
+ dataset = DIODE(data_dir_root)
123
+ return DataLoader(dataset, batch_size, **kwargs)
124
+
125
+ # get_diode_loader(data_dir_root="datasets/diode/val/outdoor")
zoedepth/data/hypersim.py ADDED
@@ -0,0 +1,138 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import glob
26
+ import os
27
+
28
+ import h5py
29
+ import numpy as np
30
+ import torch
31
+ from PIL import Image
32
+ from torch.utils.data import DataLoader, Dataset
33
+ from torchvision import transforms
34
+
35
+
36
+ def hypersim_distance_to_depth(npyDistance):
37
+ intWidth, intHeight, fltFocal = 1024, 768, 886.81
38
+
39
+ npyImageplaneX = np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth).reshape(
40
+ 1, intWidth).repeat(intHeight, 0).astype(np.float32)[:, :, None]
41
+ npyImageplaneY = np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5,
42
+ intHeight).reshape(intHeight, 1).repeat(intWidth, 1).astype(np.float32)[:, :, None]
43
+ npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32)
44
+ npyImageplane = np.concatenate(
45
+ [npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2)
46
+
47
+ npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal
48
+ return npyDepth
49
+
50
+
51
+ class ToTensor(object):
52
+ def __init__(self):
53
+ # self.normalize = transforms.Normalize(
54
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
55
+ self.normalize = lambda x: x
56
+ self.resize = transforms.Resize((480, 640))
57
+
58
+ def __call__(self, sample):
59
+ image, depth = sample['image'], sample['depth']
60
+ image = self.to_tensor(image)
61
+ image = self.normalize(image)
62
+ depth = self.to_tensor(depth)
63
+
64
+ image = self.resize(image)
65
+
66
+ return {'image': image, 'depth': depth, 'dataset': "hypersim"}
67
+
68
+ def to_tensor(self, pic):
69
+
70
+ if isinstance(pic, np.ndarray):
71
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
72
+ return img
73
+
74
+ # handle PIL Image
75
+ if pic.mode == 'I':
76
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
77
+ elif pic.mode == 'I;16':
78
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
79
+ else:
80
+ img = torch.ByteTensor(
81
+ torch.ByteStorage.from_buffer(pic.tobytes()))
82
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
83
+ if pic.mode == 'YCbCr':
84
+ nchannel = 3
85
+ elif pic.mode == 'I;16':
86
+ nchannel = 1
87
+ else:
88
+ nchannel = len(pic.mode)
89
+ img = img.view(pic.size[1], pic.size[0], nchannel)
90
+
91
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
92
+ if isinstance(img, torch.ByteTensor):
93
+ return img.float()
94
+ else:
95
+ return img
96
+
97
+
98
+ class HyperSim(Dataset):
99
+ def __init__(self, data_dir_root):
100
+ # image paths are of the form <data_dir_root>/<scene>/images/scene_cam_#_final_preview/*.tonemap.jpg
101
+ # depth paths are of the form <data_dir_root>/<scene>/images/scene_cam_#_final_preview/*.depth_meters.hdf5
102
+ self.image_files = glob.glob(os.path.join(
103
+ data_dir_root, '*', 'images', 'scene_cam_*_final_preview', '*.tonemap.jpg'))
104
+ self.depth_files = [r.replace("_final_preview", "_geometry_hdf5").replace(
105
+ ".tonemap.jpg", ".depth_meters.hdf5") for r in self.image_files]
106
+ self.transform = ToTensor()
107
+
108
+ def __getitem__(self, idx):
109
+ image_path = self.image_files[idx]
110
+ depth_path = self.depth_files[idx]
111
+
112
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
113
+
114
+ # depth from hdf5
115
+ depth_fd = h5py.File(depth_path, "r")
116
+ # in meters (Euclidean distance)
117
+ distance_meters = np.array(depth_fd['dataset'])
118
+ depth = hypersim_distance_to_depth(
119
+ distance_meters) # in meters (planar depth)
120
+
121
+ # depth[depth > 8] = -1
122
+ depth = depth[..., None]
123
+
124
+ sample = dict(image=image, depth=depth)
125
+ sample = self.transform(sample)
126
+
127
+ if idx == 0:
128
+ print(sample["image"].shape)
129
+
130
+ return sample
131
+
132
+ def __len__(self):
133
+ return len(self.image_files)
134
+
135
+
136
+ def get_hypersim_loader(data_dir_root, batch_size=1, **kwargs):
137
+ dataset = HyperSim(data_dir_root)
138
+ return DataLoader(dataset, batch_size, **kwargs)
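
hypersim_distance_to_depth converts Hypersim's per-pixel ray distance into planar depth by rescaling with f / ||(x, y, f)||, so the two agree at the principal point and planar depth shrinks toward the corners. A small sanity-check sketch:

import numpy as np
from zoedepth.data.hypersim import hypersim_distance_to_depth

dist = np.full((768, 1024), 5.0, dtype=np.float32)  # constant 5 m ray distance
depth = hypersim_distance_to_depth(dist)
assert abs(depth[384, 512] - 5.0) < 1e-3            # image centre: depth == distance
assert depth[0, 0] < 5.0                            # corner: oblique ray, smaller planar depth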
zoedepth/data/ibims.py ADDED
@@ -0,0 +1,81 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms as T
32
+
33
+
34
+ class iBims(Dataset):
35
+ def __init__(self, config):
36
+ root_folder = config.ibims_root
37
+ with open(os.path.join(root_folder, "imagelist.txt"), 'r') as f:
38
+ imglist = f.read().split()
39
+
40
+ samples = []
41
+ for basename in imglist:
42
+ img_path = os.path.join(root_folder, 'rgb', basename + ".png")
43
+ depth_path = os.path.join(root_folder, 'depth', basename + ".png")
44
+ valid_mask_path = os.path.join(
45
+ root_folder, 'mask_invalid', basename+".png")
46
+ transp_mask_path = os.path.join(
47
+ root_folder, 'mask_transp', basename+".png")
48
+
49
+ samples.append(
50
+ (img_path, depth_path, valid_mask_path, transp_mask_path))
51
+
52
+ self.samples = samples
53
+ # self.normalize = T.Normalize(
54
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
55
+ self.normalize = lambda x: x
56
+
57
+ def __getitem__(self, idx):
58
+ img_path, depth_path, valid_mask_path, transp_mask_path = self.samples[idx]
59
+
60
+ img = np.asarray(Image.open(img_path), dtype=np.float32) / 255.0
61
+ depth = np.asarray(Image.open(depth_path),
62
+ dtype=np.uint16).astype('float')*50.0/65535
63
+
64
+ mask_valid = np.asarray(Image.open(valid_mask_path))
65
+ mask_transp = np.asarray(Image.open(transp_mask_path))
66
+
67
+ # depth = depth * mask_valid * mask_transp
68
+ depth = np.where(mask_valid * mask_transp, depth, -1)
69
+
70
+ img = torch.from_numpy(img).permute(2, 0, 1)
71
+ img = self.normalize(img)
72
+ depth = torch.from_numpy(depth).unsqueeze(0)
73
+ return dict(image=img, depth=depth, image_path=img_path, depth_path=depth_path, dataset='ibims')
74
+
75
+ def __len__(self):
76
+ return len(self.samples)
77
+
78
+
79
+ def get_ibims_loader(config, batch_size=1, **kwargs):
80
+ dataloader = DataLoader(iBims(config), batch_size=batch_size, **kwargs)
81
+ return dataloader
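
For reference, the depth decode above maps raw uint16 PNG values linearly onto [0, 50] metres and stamps pixels rejected by either mask with the -1 sentinel. A tiny worked example:

import numpy as np

raw = np.array([[0, 32768, 65535]], dtype=np.uint16)
depth = raw.astype('float') * 50.0 / 65535  # -> [0.0, ~25.0, 50.0] metres
mask = np.array([[1, 0, 1]])                # e.g. mask_valid * mask_transp
depth = np.where(mask, depth, -1)           # [[ 0. -1. 50.]]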
zoedepth/data/marigold_nyu.py ADDED
@@ -0,0 +1,112 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms
32
+ from random import choice
33
+
34
+
35
+ class ToTensor(object):
36
+ def __init__(self):
37
+ self.normalize = transforms.Normalize(
38
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
39
+ #self.normalize = lambda x : x
40
+
41
+ def __call__(self, sample):
42
+ image, depth = sample['image'], sample['depth']
43
+ image = self.to_tensor(image)
44
+ image = self.normalize(image)
45
+ depth = self.to_tensor(depth)
46
+
47
+ return {'image': image, 'depth': depth, 'dataset': "marigold_nyu"}
48
+
49
+ def to_tensor(self, pic):
50
+
51
+ if isinstance(pic, np.ndarray):
52
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
53
+ return img
54
+
55
+ # handle PIL Image
56
+ if pic.mode == 'I':
57
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
58
+ elif pic.mode == 'I;16':
59
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
60
+ else:
61
+ img = torch.ByteTensor(
62
+ torch.ByteStorage.from_buffer(pic.tobytes()))
63
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
64
+ if pic.mode == 'YCbCr':
65
+ nchannel = 3
66
+ elif pic.mode == 'I;16':
67
+ nchannel = 1
68
+ else:
69
+ nchannel = len(pic.mode)
70
+ img = img.view(pic.size[1], pic.size[0], nchannel)
71
+
72
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
73
+ if isinstance(img, torch.ByteTensor):
74
+ return img.float()
75
+ else:
76
+ return img
77
+
78
+
79
+ class MarigoldNYU(Dataset):
80
+ def __init__(self, nyu_dir_root, marigold_depth_root, debug_mode=False):
81
+ import glob
82
+ import os
83
+ import itertools
84
+
85
+ categories = os.listdir(nyu_dir_root)
86
+ if debug_mode:
87
+ categories = categories[:2]
88
+
89
+ self.image_files = list(itertools.chain(*[glob.glob(os.path.join(nyu_dir_root, c, "rgb_*.jpg")) for c in categories]))
90
+ self.nyu_depth_files = [os.path.join(nyu_dir_root, os.path.join(*r.split("/")[-2:])).replace("jpg", "png").replace("rgb", "sync_depth") for r in self.image_files]
91
+ self.marigold_depth_files = [os.path.join(marigold_depth_root, os.path.join(*r.split("/")[-2:])).replace("jpg", "npy") for r in self.image_files]
92
+
93
+ self.transform = ToTensor()  # note: not applied in __getitem__, which returns raw numpy arrays
94
+
95
+ def __getitem__(self, idx):
96
+ image_path = self.image_files[idx]
97
+ nyu_depth_path = self.nyu_depth_files[idx]
98
+ marigold_depth_path = self.marigold_depth_files[idx]
99
+
100
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
101
+ nyu_depth = np.asarray(Image.open(nyu_depth_path), dtype=np.float32)
102
+ marigold_depth = np.load(marigold_depth_path)
103
+
104
+ return image, nyu_depth[..., np.newaxis], marigold_depth[..., np.newaxis], image_path, nyu_depth_path
105
+
106
+ def __len__(self):
107
+ return len(self.image_files)
108
+
109
+
110
+ def get_marigold_nyu_loader(nyu_dir_root, marigold_depth_root, batch_size=1, **kwargs):
111
+ dataset = MarigoldNYU(nyu_dir_root, marigold_depth_root)
112
+ return DataLoader(dataset, batch_size, **kwargs)
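
Usage sketch (both roots are placeholders). Unlike the loaders above, __getitem__ returns a raw tuple without applying self.transform, so PyTorch's default collate stacks the channels-last numpy arrays into tensors and gathers the paths into lists:

from zoedepth.data.marigold_nyu import get_marigold_nyu_loader

loader = get_marigold_nyu_loader("data/nyu", "data/marigold_depth", batch_size=2)
image, nyu_depth, marigold_depth, image_paths, nyu_depth_paths = next(iter(loader))
# image:          (2, H, W, 3) float32 in [0, 1], channels last
# nyu_depth:      (2, H, W, 1) raw (unscaled) sync_depth values
# marigold_depth: (2, H, W, 1) Marigold predictions loaded from .npy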
zoedepth/data/places365.py ADDED
@@ -0,0 +1,118 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms
32
+ from random import choice
33
+
34
+
35
+ class ToTensor(object):
36
+ def __init__(self):
37
+ self.normalize = transforms.Normalize(
38
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
39
+ #self.normalize = lambda x : x
40
+
41
+ def __call__(self, sample):
42
+ image, depth = sample['image'], sample['depth']
43
+ image = self.to_tensor(image)
44
+ image = self.normalize(image)
45
+ depth = self.to_tensor(depth)
46
+
47
+ return {'image': image, 'depth': depth, 'dataset': "places365"}
48
+
49
+ def to_tensor(self, pic):
50
+
51
+ if isinstance(pic, np.ndarray):
52
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
53
+ return img
54
+
55
+ # handle PIL Image
56
+ if pic.mode == 'I':
57
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
58
+ elif pic.mode == 'I;16':
59
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
60
+ else:
61
+ img = torch.ByteTensor(
62
+ torch.ByteStorage.from_buffer(pic.tobytes()))
63
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
64
+ if pic.mode == 'YCbCr':
65
+ nchannel = 3
66
+ elif pic.mode == 'I;16':
67
+ nchannel = 1
68
+ else:
69
+ nchannel = len(pic.mode)
70
+ img = img.view(pic.size[1], pic.size[0], nchannel)
71
+
72
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
73
+ if isinstance(img, torch.ByteTensor):
74
+ return img.float()
75
+ else:
76
+ return img
77
+
78
+
79
+ class Places365(Dataset):
80
+ def __init__(self, data_dir_root, depth_dir_root, depth_masks_dir_root, randomize_masks=True, debug_mode=False):
81
+ import glob
82
+ import os
83
+ import itertools
84
+
85
+ categories = os.listdir(data_dir_root)
86
+ if debug_mode:
87
+ categories = categories[:2]
88
+
89
+ self.image_files = list(itertools.chain(*[glob.glob(os.path.join(data_dir_root, c, "*.jpg")) for c in categories]))
90
+ self.depth_files = [os.path.join(depth_dir_root, os.path.join(*r.split("/")[-2:])).replace("jpg", "npy") for r in self.image_files]
91
+ self.depth_masks_files = [os.path.join(depth_masks_dir_root, os.path.join(*r.split("/")[-2:])).replace("jpg", "npy") for r in self.image_files]
92
+
93
+ self.randomize_masks = randomize_masks
94
+
95
+ self.transform = ToTensor()  # note: not applied in __getitem__, which returns raw numpy arrays
96
+
97
+ def __getitem__(self, idx):
98
+ image_path = self.image_files[idx]
99
+ depth_path = self.depth_files[idx]
100
+
101
+ if not self.randomize_masks:
102
+ depth_masks_path = self.depth_masks_files[idx]
103
+ else:
104
+ depth_masks_path = choice(self.depth_masks_files)
105
+
106
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
107
+ depth = np.load(depth_path)
108
+ depth_mask = 1 - np.load(depth_masks_path)
109
+
110
+ return image, depth[..., np.newaxis], depth_mask[..., np.newaxis], image_path, depth_path, depth_masks_path
111
+
112
+ def __len__(self):
113
+ return len(self.image_files)
114
+
115
+
116
+ def get_places365_loader(data_dir_root, depth_dir_root, depth_masks_dir_root, batch_size=1, **kwargs):
117
+ dataset = Places365(data_dir_root, depth_dir_root, depth_masks_dir_root)
118
+ return DataLoader(dataset, batch_size, **kwargs)
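
Usage sketch (roots are placeholders). With randomize_masks=True (the default), each depth map is paired with a mask drawn at random from the whole pool rather than its own, which is why the chosen mask path is returned alongside the sample:

from zoedepth.data.places365 import Places365

ds = Places365("data/places365", "data/places365_depth", "data/places365_masks")
image, depth, depth_mask, image_path, depth_path, mask_path = ds[0]
# depth_mask = 1 - stored mask, shape (H, W, 1)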
zoedepth/data/preprocess.py ADDED
@@ -0,0 +1,154 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import numpy as np
26
+ from dataclasses import dataclass
27
+ from typing import Tuple
28
+
29
+ # dataclass to store the crop parameters
30
+ @dataclass
31
+ class CropParams:
32
+ top: int
33
+ bottom: int
34
+ left: int
35
+ right: int
36
+
37
+
38
+
39
+ def get_border_params(rgb_image, tolerance=0.1, cut_off=20, value=0, level_diff_threshold=5, channel_axis=-1, min_border=5) -> CropParams:
40
+ gray_image = np.mean(rgb_image, axis=channel_axis)
41
+ h, w = gray_image.shape
42
+
43
+
44
+ def num_value_pixels(arr):
45
+ return np.sum(np.abs(arr - value) < level_diff_threshold)
46
+
47
+ def is_above_tolerance(arr, total_pixels):
48
+ return (num_value_pixels(arr) / total_pixels) > tolerance
49
+
50
+ # Crop the top border until the fraction of value pixels falls below the tolerance
51
+ top = min_border
52
+ while is_above_tolerance(gray_image[top, :], w) and top < h-1:
53
+ top += 1
54
+ if top > cut_off:
55
+ break
56
+
57
+ # Crop the bottom border until the fraction of value pixels falls below the tolerance
58
+ bottom = h - min_border
59
+ while is_above_tolerance(gray_image[bottom, :], w) and bottom > 0:
60
+ bottom -= 1
61
+ if h - bottom > cut_off:
62
+ break
63
+
64
+ # Crop the left border until the fraction of value pixels falls below the tolerance
65
+ left = min_border
66
+ while is_above_tolerance(gray_image[:, left], h) and left < w-1:
67
+ left += 1
68
+ if left > cut_off:
69
+ break
70
+
71
+ # Crop the right border until the fraction of value pixels falls below the tolerance
72
+ right = w - min_border
73
+ while is_above_tolerance(gray_image[:, right], h) and right > 0:
74
+ right -= 1
75
+ if w - right > cut_off:
76
+ break
77
+
78
+
79
+ return CropParams(top, bottom, left, right)
80
+
81
+
82
+ def get_white_border(rgb_image, value=255, **kwargs) -> CropParams:
83
+ """Crops the white border of the RGB image.
84
+
85
+ Args:
86
+ rgb_image: RGB image, shape (H, W, 3).
87
+ Returns:
88
+ Crop parameters.
89
+ """
90
+ if value == 255:
91
+ # assert range of values in rgb image is [0, 255]
92
+ assert np.max(rgb_image) <= 255 and np.min(rgb_image) >= 0, "RGB image values are not in range [0, 255]."
93
+ assert rgb_image.max() > 1, "RGB image values are not in range [0, 255]."
94
+ elif value == 1:
95
+ # assert range of values in rgb image is [0, 1]
96
+ assert np.max(rgb_image) <= 1 and np.min(rgb_image) >= 0, "RGB image values are not in range [0, 1]."
97
+
98
+ return get_border_params(rgb_image, value=value, **kwargs)
99
+
100
+ def get_black_border(rgb_image, **kwargs) -> CropParams:
101
+ """Crops the black border of the RGB image.
102
+
103
+ Args:
104
+ rgb_image: RGB image, shape (H, W, 3).
105
+
106
+ Returns:
107
+ Crop parameters.
108
+ """
109
+
110
+ return get_border_params(rgb_image, value=0, **kwargs)
111
+
112
+ def crop_image(image: np.ndarray, crop_params: CropParams) -> np.ndarray:
113
+ """Crops the image according to the crop parameters.
114
+
115
+ Args:
116
+ image: RGB or depth image, shape (H, W, 3) or (H, W).
117
+ crop_params: Crop parameters.
118
+
119
+ Returns:
120
+ Cropped image.
121
+ """
122
+ return image[crop_params.top:crop_params.bottom, crop_params.left:crop_params.right]
123
+
124
+ def crop_images(*images: np.ndarray, crop_params: CropParams) -> Tuple[np.ndarray]:
125
+ """Crops the images according to the crop parameters.
126
+
127
+ Args:
128
+ images: RGB or depth images, shape (H, W, 3) or (H, W).
129
+ crop_params: Crop parameters.
130
+
131
+ Returns:
132
+ Cropped images.
133
+ """
134
+ return tuple(crop_image(image, crop_params) for image in images)
135
+
136
+ def crop_black_or_white_border(rgb_image, *other_images: np.ndarray, tolerance=0.1, cut_off=20, level_diff_threshold=5) -> Tuple[np.ndarray]:
137
+ """Crops the white and black border of the RGB and depth images.
138
+
139
+ Args:
140
+ rgb_image: RGB image, shape (H, W, 3). This image is used to determine the border.
141
+ other_images: The other images to crop according to the border of the RGB image.
142
+ Returns:
143
+ Cropped RGB and other images.
144
+ """
145
+ # crop black border
146
+ crop_params = get_black_border(rgb_image, tolerance=tolerance, cut_off=cut_off, level_diff_threshold=level_diff_threshold)
147
+ cropped_images = crop_images(rgb_image, *other_images, crop_params=crop_params)
148
+
149
+ # crop white border
150
+ crop_params = get_white_border(cropped_images[0], tolerance=tolerance, cut_off=cut_off, level_diff_threshold=level_diff_threshold)
151
+ cropped_images = crop_images(*cropped_images, crop_params=crop_params)
152
+
153
+ return cropped_images
154
+
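
A worked sketch of the border cropping above on a synthetic image with a thin black frame. Each pass (black, then white) trims at least min_border (5 px) per side even when no border is detected, so a 100x100 input comes back as 80x80:

import numpy as np
from zoedepth.data.preprocess import crop_black_or_white_border

rgb = np.zeros((100, 100, 3), dtype=np.float32)
rgb[4:96, 4:96] = 128.0                        # 4 px black frame around grey content
depth = np.random.rand(100, 100).astype(np.float32)

cropped_rgb, cropped_depth = crop_black_or_white_border(rgb, depth)
print(cropped_rgb.shape, cropped_depth.shape)  # (80, 80, 3) (80, 80)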
zoedepth/data/sun_rgbd_loader.py ADDED
@@ -0,0 +1,106 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import numpy as np
28
+ import torch
29
+ from PIL import Image
30
+ from torch.utils.data import DataLoader, Dataset
31
+ from torchvision import transforms
32
+
33
+
34
+ class ToTensor(object):
35
+ def __init__(self):
36
+ # self.normalize = transforms.Normalize(
37
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
38
+ self.normalize = lambda x: x
39
+
40
+ def __call__(self, sample):
41
+ image, depth = sample['image'], sample['depth']
42
+ image = self.to_tensor(image)
43
+ image = self.normalize(image)
44
+ depth = self.to_tensor(depth)
45
+
46
+ return {'image': image, 'depth': depth, 'dataset': "sunrgbd"}
47
+
48
+ def to_tensor(self, pic):
49
+
50
+ if isinstance(pic, np.ndarray):
51
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
52
+ return img
53
+
54
+ # handle PIL Image
55
+ if pic.mode == 'I':
56
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
57
+ elif pic.mode == 'I;16':
58
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
59
+ else:
60
+ img = torch.ByteTensor(
61
+ torch.ByteStorage.from_buffer(pic.tobytes()))
62
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
63
+ if pic.mode == 'YCbCr':
64
+ nchannel = 3
65
+ elif pic.mode == 'I;16':
66
+ nchannel = 1
67
+ else:
68
+ nchannel = len(pic.mode)
69
+ img = img.view(pic.size[1], pic.size[0], nchannel)
70
+
71
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
72
+ if isinstance(img, torch.ByteTensor):
73
+ return img.float()
74
+ else:
75
+ return img
76
+
77
+
78
+ class SunRGBD(Dataset):
79
+ def __init__(self, data_dir_root):
80
+ # test_file_dirs = loadmat(train_test_file)['alltest'].squeeze()
81
+ # all_test = [t[0].replace("/n/fs/sun3d/data/", "") for t in test_file_dirs]
82
+ # self.all_test = [os.path.join(data_dir_root, t) for t in all_test]
83
+ import glob
84
+ self.image_files = glob.glob(
85
+ os.path.join(data_dir_root, 'rgb', 'rgb', '*'))
86
+ self.depth_files = [
87
+ r.replace("rgb/rgb", "gt/gt").replace("jpg", "png") for r in self.image_files]
88
+ self.transform = ToTensor()
89
+
90
+ def __getitem__(self, idx):
91
+ image_path = self.image_files[idx]
92
+ depth_path = self.depth_files[idx]
93
+
94
+ image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0
95
+ depth = np.asarray(Image.open(depth_path), dtype='uint16') / 1000.0
96
+ depth[depth > 8] = -1
97
+ depth = depth[..., None]
98
+ return self.transform(dict(image=image, depth=depth))
99
+
100
+ def __len__(self):
101
+ return len(self.image_files)
102
+
103
+
104
+ def get_sunrgbd_loader(data_dir_root, batch_size=1, **kwargs):
105
+ dataset = SunRGBD(data_dir_root)
106
+ return DataLoader(dataset, batch_size, **kwargs)
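
The depth > 8 clamp above follows the sentinel convention shared by these evaluation loaders: invalid or out-of-range depths are set to -1, and metric code masks on positive depth. A minimal sketch:

import torch

depth = torch.tensor([[0.5, 3.2, -1.0, 7.9]])  # metres; -1 marks invalid pixels
valid = depth > 0
mean_depth = depth[valid].mean()               # 3.8667, sentinel excluded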
zoedepth/data/transforms.py ADDED
@@ -0,0 +1,481 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import math
26
+ import random
27
+
28
+ import cv2
29
+ import numpy as np
30
+
31
+
32
+ class RandomFliplr(object):
33
+ """Horizontal flip of the sample with given probability.
34
+ """
35
+
36
+ def __init__(self, probability=0.5):
37
+ """Init.
38
+
39
+ Args:
40
+ probability (float, optional): Flip probability. Defaults to 0.5.
41
+ """
42
+ self.__probability = probability
43
+
44
+ def __call__(self, sample):
45
+ prob = random.random()
46
+
47
+ if prob < self.__probability:
48
+ for k, v in sample.items():
49
+ if len(v.shape) >= 2:
50
+ sample[k] = np.fliplr(v).copy()
51
+
52
+ return sample
53
+
54
+
55
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
56
+ """Resize the sample to ensure the given size. Keeps aspect ratio.
57
+
58
+ Args:
59
+ sample (dict): sample
60
+ size (tuple): image size
61
+
62
+ Returns:
63
+ tuple: new size (if the sample is already large enough, the sample itself is returned unchanged)
64
+ """
65
+ shape = list(sample["disparity"].shape)
66
+
67
+ if shape[0] >= size[0] and shape[1] >= size[1]:
68
+ return sample
69
+
70
+ scale = [0, 0]
71
+ scale[0] = size[0] / shape[0]
72
+ scale[1] = size[1] / shape[1]
73
+
74
+ scale = max(scale)
75
+
76
+ shape[0] = math.ceil(scale * shape[0])
77
+ shape[1] = math.ceil(scale * shape[1])
78
+
79
+ # resize
80
+ sample["image"] = cv2.resize(
81
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
82
+ )
83
+
84
+ sample["disparity"] = cv2.resize(
85
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
86
+ )
87
+ sample["mask"] = cv2.resize(
88
+ sample["mask"].astype(np.float32),
89
+ tuple(shape[::-1]),
90
+ interpolation=cv2.INTER_NEAREST,
91
+ )
92
+ sample["mask"] = sample["mask"].astype(bool)
93
+
94
+ return tuple(shape)
95
+
96
+
97
+ class RandomCrop(object):
98
+ """Get a random crop of the sample with the given size (width, height).
99
+ """
100
+
101
+ def __init__(
102
+ self,
103
+ width,
104
+ height,
105
+ resize_if_needed=False,
106
+ image_interpolation_method=cv2.INTER_AREA,
107
+ ):
108
+ """Init.
109
+
110
+ Args:
111
+ width (int): output width
112
+ height (int): output height
113
+ resize_if_needed (bool, optional): If True, sample might be upsampled to ensure
114
+ that a crop of size (width, height) is possible. Defaults to False.
115
+ """
116
+ self.__size = (height, width)
117
+ self.__resize_if_needed = resize_if_needed
118
+ self.__image_interpolation_method = image_interpolation_method
119
+
120
+ def __call__(self, sample):
121
+
122
+ shape = sample["disparity"].shape
123
+
124
+ if self.__size[0] > shape[0] or self.__size[1] > shape[1]:
125
+ if self.__resize_if_needed:
126
+ shape = apply_min_size(
127
+ sample, self.__size, self.__image_interpolation_method
128
+ )
129
+ else:
130
+ raise Exception(
131
+ "Output size {} bigger than input size {}.".format(
132
+ self.__size, shape
133
+ )
134
+ )
135
+
136
+ offset = (
137
+ np.random.randint(shape[0] - self.__size[0] + 1),
138
+ np.random.randint(shape[1] - self.__size[1] + 1),
139
+ )
140
+
141
+ for k, v in sample.items():
142
+ if k == "code" or k == "basis":
143
+ continue
144
+
145
+ if len(sample[k].shape) >= 2:
146
+ sample[k] = v[
147
+ offset[0]: offset[0] + self.__size[0],
148
+ offset[1]: offset[1] + self.__size[1],
149
+ ]
150
+
151
+ return sample
152
+
153
+
154
+ class Resize(object):
155
+ """Resize sample to given size (width, height).
156
+ """
157
+
158
+ def __init__(
159
+ self,
160
+ width,
161
+ height,
162
+ resize_target=True,
163
+ keep_aspect_ratio=False,
164
+ ensure_multiple_of=1,
165
+ resize_method="lower_bound",
166
+ image_interpolation_method=cv2.INTER_AREA,
167
+ letter_box=False,
168
+ ):
169
+ """Init.
170
+
171
+ Args:
172
+ width (int): desired output width
173
+ height (int): desired output height
174
+ resize_target (bool, optional):
175
+ True: Resize the full sample (image, mask, target).
176
+ False: Resize image only.
177
+ Defaults to True.
178
+ keep_aspect_ratio (bool, optional):
179
+ True: Keep the aspect ratio of the input sample.
180
+ Output sample might not have the given width and height, and
181
+ resize behaviour depends on the parameter 'resize_method'.
182
+ Defaults to False.
183
+ ensure_multiple_of (int, optional):
184
+ Output width and height is constrained to be multiple of this parameter.
185
+ Defaults to 1.
186
+ resize_method (str, optional):
187
+ "lower_bound": Output will be at least as large as the given size.
188
+ "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than given size.)
189
+ "minimal": Scale as little as possible. (Output size might be smaller than given size.)
190
+ Defaults to "lower_bound".
191
+ """
192
+ self.__width = width
193
+ self.__height = height
194
+
195
+ self.__resize_target = resize_target
196
+ self.__keep_aspect_ratio = keep_aspect_ratio
197
+ self.__multiple_of = ensure_multiple_of
198
+ self.__resize_method = resize_method
199
+ self.__image_interpolation_method = image_interpolation_method
200
+ self.__letter_box = letter_box
201
+
202
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
203
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
204
+
205
+ if max_val is not None and y > max_val:
206
+ y = (np.floor(x / self.__multiple_of)
207
+ * self.__multiple_of).astype(int)
208
+
209
+ if y < min_val:
210
+ y = (np.ceil(x / self.__multiple_of)
211
+ * self.__multiple_of).astype(int)
212
+
213
+ return y
214
+
215
+ def get_size(self, width, height):
216
+ # determine new height and width
217
+ scale_height = self.__height / height
218
+ scale_width = self.__width / width
219
+
220
+ if self.__keep_aspect_ratio:
221
+ if self.__resize_method == "lower_bound":
222
+ # scale such that output size is lower bound
223
+ if scale_width > scale_height:
224
+ # fit width
225
+ scale_height = scale_width
226
+ else:
227
+ # fit height
228
+ scale_width = scale_height
229
+ elif self.__resize_method == "upper_bound":
230
+ # scale such that output size is upper bound
231
+ if scale_width < scale_height:
232
+ # fit width
233
+ scale_height = scale_width
234
+ else:
235
+ # fit height
236
+ scale_width = scale_height
237
+ elif self.__resize_method == "minimal":
238
+ # scale as little as possible
239
+ if abs(1 - scale_width) < abs(1 - scale_height):
240
+ # fit width
241
+ scale_height = scale_width
242
+ else:
243
+ # fit height
244
+ scale_width = scale_height
245
+ else:
246
+ raise ValueError(
247
+ f"resize_method {self.__resize_method} not implemented"
248
+ )
249
+
250
+ if self.__resize_method == "lower_bound":
251
+ new_height = self.constrain_to_multiple_of(
252
+ scale_height * height, min_val=self.__height
253
+ )
254
+ new_width = self.constrain_to_multiple_of(
255
+ scale_width * width, min_val=self.__width
256
+ )
257
+ elif self.__resize_method == "upper_bound":
258
+ new_height = self.constrain_to_multiple_of(
259
+ scale_height * height, max_val=self.__height
260
+ )
261
+ new_width = self.constrain_to_multiple_of(
262
+ scale_width * width, max_val=self.__width
263
+ )
264
+ elif self.__resize_method == "minimal":
265
+ new_height = self.constrain_to_multiple_of(scale_height * height)
266
+ new_width = self.constrain_to_multiple_of(scale_width * width)
267
+ else:
268
+ raise ValueError(
269
+ f"resize_method {self.__resize_method} not implemented")
270
+
271
+ return (new_width, new_height)
272
+
273
+ def make_letter_box(self, sample):
274
+ top = bottom = (self.__height - sample.shape[0]) // 2
275
+ left = right = (self.__width - sample.shape[1]) // 2
276
+ sample = cv2.copyMakeBorder(
277
+ sample, top, bottom, left, right, cv2.BORDER_CONSTANT, None, 0)
278
+ return sample
279
+
280
+ def __call__(self, sample):
281
+ width, height = self.get_size(
282
+ sample["image"].shape[1], sample["image"].shape[0]
283
+ )
284
+
285
+ # resize sample
286
+ sample["image"] = cv2.resize(
287
+ sample["image"],
288
+ (width, height),
289
+ interpolation=self.__image_interpolation_method,
290
+ )
291
+
292
+ if self.__letter_box:
293
+ sample["image"] = self.make_letter_box(sample["image"])
294
+
295
+ if self.__resize_target:
296
+ if "disparity" in sample:
297
+ sample["disparity"] = cv2.resize(
298
+ sample["disparity"],
299
+ (width, height),
300
+ interpolation=cv2.INTER_NEAREST,
301
+ )
302
+
303
+ if self.__letter_box:
304
+ sample["disparity"] = self.make_letter_box(
305
+ sample["disparity"])
306
+
307
+ if "depth" in sample:
308
+ sample["depth"] = cv2.resize(
309
+ sample["depth"], (width,
310
+ height), interpolation=cv2.INTER_NEAREST
311
+ )
312
+
313
+ if self.__letter_box:
314
+ sample["depth"] = self.make_letter_box(sample["depth"])
315
+
316
+ sample["mask"] = cv2.resize(
317
+ sample["mask"].astype(np.float32),
318
+ (width, height),
319
+ interpolation=cv2.INTER_NEAREST,
320
+ )
321
+
322
+ if self.__letter_box:
323
+ sample["mask"] = self.make_letter_box(sample["mask"])
324
+
325
+ sample["mask"] = sample["mask"].astype(bool)
326
+
327
+ return sample
328
+
329
+
330
+ class ResizeFixed(object):
331
+ def __init__(self, size):
332
+ self.__size = size
333
+
334
+ def __call__(self, sample):
335
+ sample["image"] = cv2.resize(
336
+ sample["image"], self.__size[::-1], interpolation=cv2.INTER_LINEAR
337
+ )
338
+
339
+ sample["disparity"] = cv2.resize(
340
+ sample["disparity"], self.__size[::-
341
+ 1], interpolation=cv2.INTER_NEAREST
342
+ )
343
+
344
+ sample["mask"] = cv2.resize(
345
+ sample["mask"].astype(np.float32),
346
+ self.__size[::-1],
347
+ interpolation=cv2.INTER_NEAREST,
348
+ )
349
+ sample["mask"] = sample["mask"].astype(bool)
350
+
351
+ return sample
352
+
353
+
354
+ class Rescale(object):
355
+ """Rescale target values to the interval [0, max_val].
356
+ If input is constant, values are set to max_val / 2.
357
+ """
358
+
359
+ def __init__(self, max_val=1.0, use_mask=True):
360
+ """Init.
361
+
362
+ Args:
363
+ max_val (float, optional): Max output value. Defaults to 1.0.
364
+ use_mask (bool, optional): Only operate on valid pixels (mask == True). Defaults to True.
365
+ """
366
+ self.__max_val = max_val
367
+ self.__use_mask = use_mask
368
+
369
+ def __call__(self, sample):
370
+ disp = sample["disparity"]
371
+
372
+ if self.__use_mask:
373
+ mask = sample["mask"]
374
+ else:
375
+ mask = np.ones_like(disp, dtype=bool)  # np.bool was removed in NumPy 1.24
376
+
377
+ if np.sum(mask) == 0:
378
+ return sample
379
+
380
+ min_val = np.min(disp[mask])
381
+ max_val = np.max(disp[mask])
382
+
383
+ if max_val > min_val:
384
+ sample["disparity"][mask] = (
385
+ (disp[mask] - min_val) / (max_val - min_val) * self.__max_val
386
+ )
387
+ else:
388
+ sample["disparity"][mask] = np.ones_like(
389
+ disp[mask]) * self.__max_val / 2.0
390
+
391
+ return sample
392
+
393
+
394
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
395
+ class NormalizeImage(object):
396
+ """Normalize image by the given mean and std.
397
+ """
398
+
399
+ def __init__(self, mean, std):
400
+ self.__mean = mean
401
+ self.__std = std
402
+
403
+ def __call__(self, sample):
404
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
405
+
406
+ return sample
407
+
408
+
409
+ class DepthToDisparity(object):
410
+ """Convert depth to disparity. Removes depth from sample.
411
+ """
412
+
413
+ def __init__(self, eps=1e-4):
414
+ self.__eps = eps
415
+
416
+ def __call__(self, sample):
417
+ assert "depth" in sample
418
+
419
+ sample["mask"][sample["depth"] < self.__eps] = False
420
+
421
+ sample["disparity"] = np.zeros_like(sample["depth"])
422
+ sample["disparity"][sample["depth"] >= self.__eps] = (
423
+ 1.0 / sample["depth"][sample["depth"] >= self.__eps]
424
+ )
425
+
426
+ del sample["depth"]
427
+
428
+ return sample
429
+
430
+
431
+ class DisparityToDepth(object):
432
+ """Convert disparity to depth. Removes disparity from sample.
433
+ """
434
+
435
+ def __init__(self, eps=1e-4):
436
+ self.__eps = eps
437
+
438
+ def __call__(self, sample):
439
+ assert "disparity" in sample
440
+
441
+ disp = np.abs(sample["disparity"])
442
+ sample["mask"][disp < self.__eps] = False
443
+
444
+ # print(sample["disparity"])
445
+ # print(sample["mask"].sum())
446
+ # exit()
447
+
448
+ sample["depth"] = np.zeros_like(disp)
449
+ sample["depth"][disp >= self.__eps] = (
450
+ 1.0 / disp[disp >= self.__eps]
451
+ )
452
+
453
+ del sample["disparity"]
454
+
455
+ return sample
456
+
457
+
458
+ class PrepareForNet(object):
459
+ """Prepare sample for usage as network input.
460
+ """
461
+
462
+ def __init__(self):
463
+ pass
464
+
465
+ def __call__(self, sample):
466
+ image = np.transpose(sample["image"], (2, 0, 1))
467
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
468
+
469
+ if "mask" in sample:
470
+ sample["mask"] = sample["mask"].astype(np.float32)
471
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
472
+
473
+ if "disparity" in sample:
474
+ disparity = sample["disparity"].astype(np.float32)
475
+ sample["disparity"] = np.ascontiguousarray(disparity)
476
+
477
+ if "depth" in sample:
478
+ depth = sample["depth"].astype(np.float32)
479
+ sample["depth"] = np.ascontiguousarray(depth)
480
+
481
+ return sample
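
A minimal composition sketch for the transforms above in the usual MiDaS-style order (the 384/32 settings are illustrative, not mandated by this file):

import numpy as np
from torchvision.transforms import Compose
from zoedepth.data.transforms import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(384, 384, resize_target=True, keep_aspect_ratio=True,
           ensure_multiple_of=32, resize_method="lower_bound"),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

sample = {
    "image": np.random.rand(480, 640, 3).astype(np.float32),
    "depth": np.random.rand(480, 640).astype(np.float32),
    "mask": np.ones((480, 640), dtype=bool),
}
out = transform(sample)
print(out["image"].shape)  # (3, 384, 512): both sides >= 384 and multiples of 32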
zoedepth/data/vkitti.py ADDED
@@ -0,0 +1,151 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import torch
26
+ from torch.utils.data import Dataset, DataLoader
27
+ from torchvision import transforms
28
+ import os
29
+
30
+ from PIL import Image
31
+ import numpy as np
32
+ import cv2
33
+
34
+
35
+ class ToTensor(object):
36
+ def __init__(self):
37
+ self.normalize = transforms.Normalize(
38
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
39
+ # self.resize = transforms.Resize((375, 1242))
40
+
41
+ def __call__(self, sample):
42
+ image, depth = sample['image'], sample['depth']
43
+
44
+ image = self.to_tensor(image)
45
+ image = self.normalize(image)
46
+ depth = self.to_tensor(depth)
47
+
48
+ # image = self.resize(image)
49
+
50
+ return {'image': image, 'depth': depth, 'dataset': "vkitti"}
51
+
52
+ def to_tensor(self, pic):
53
+
54
+ if isinstance(pic, np.ndarray):
55
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
56
+ return img
57
+
58
+ # handle PIL Image
59
+ if pic.mode == 'I':
60
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
61
+ elif pic.mode == 'I;16':
62
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
63
+ else:
64
+ img = torch.ByteTensor(
65
+ torch.ByteStorage.from_buffer(pic.tobytes()))
66
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
67
+ if pic.mode == 'YCbCr':
68
+ nchannel = 3
69
+ elif pic.mode == 'I;16':
70
+ nchannel = 1
71
+ else:
72
+ nchannel = len(pic.mode)
73
+ img = img.view(pic.size[1], pic.size[0], nchannel)
74
+
75
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
76
+ if isinstance(img, torch.ByteTensor):
77
+ return img.float()
78
+ else:
79
+ return img
80
+
81
+
82
+ class VKITTI(Dataset):
83
+ def __init__(self, data_dir_root, do_kb_crop=True):
84
+ import glob
85
+ # image and depth paths are of the form <data_dir_root>/{test_color, test_depth}/*.png
86
+ self.image_files = glob.glob(os.path.join(
87
+ data_dir_root, "test_color", '*.png'))
88
+ self.depth_files = [r.replace("test_color", "test_depth")
89
+ for r in self.image_files]
90
+ self.do_kb_crop = do_kb_crop
91
+ self.transform = ToTensor()
92
+
93
+ def __getitem__(self, idx):
94
+ image_path = self.image_files[idx]
95
+ depth_path = self.depth_files[idx]
96
+
97
+ image = Image.open(image_path)
98
+ # depth = Image.open(depth_path)  # superseded by the cv2 read below
99
+ depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR |
100
+ cv2.IMREAD_ANYDEPTH)
101
+ print("depth min max", depth.min(), depth.max())
102
+
103
+ # print(np.shape(image))
104
+ # print(np.shape(depth))
105
+
106
+ # depth[depth > 8] = -1
107
+
108
+ if self.do_kb_crop and False:  # crop disabled: depth is a numpy array here, so the PIL crops below would fail
109
+ height = image.height
110
+ width = image.width
111
+ top_margin = int(height - 352)
112
+ left_margin = int((width - 1216) / 2)
113
+ depth = depth.crop(
114
+ (left_margin, top_margin, left_margin + 1216, top_margin + 352))
115
+ image = image.crop(
116
+ (left_margin, top_margin, left_margin + 1216, top_margin + 352))
117
+ # uv = uv[:, top_margin:top_margin + 352, left_margin:left_margin + 1216]
118
+
119
+ image = np.asarray(image, dtype=np.float32) / 255.0
120
+ # depth = np.asarray(depth, dtype=np.uint16) /1.
121
+ depth = depth[..., None]
122
+ sample = dict(image=image, depth=depth)
123
+
124
+ # return sample
125
+ sample = self.transform(sample)
126
+
127
+ if idx == 0:
128
+ print(sample["image"].shape)
129
+
130
+ return sample
131
+
132
+ def __len__(self):
133
+ return len(self.image_files)
134
+
135
+
136
+ def get_vkitti_loader(data_dir_root, batch_size=1, **kwargs):
137
+ dataset = VKITTI(data_dir_root)
138
+ return DataLoader(dataset, batch_size, **kwargs)
139
+
140
+
141
+ if __name__ == "__main__":
142
+ loader = get_vkitti_loader(
143
+ data_dir_root="/home/bhatsf/shortcuts/datasets/vkitti_test")
144
+ print("Total files", len(loader.dataset))
145
+ for i, sample in enumerate(loader):
146
+ print(sample["image"].shape)
147
+ print(sample["depth"].shape)
148
+ print(sample["dataset"])
149
+ print(sample['depth'].min(), sample['depth'].max())
150
+ if i > 5:
151
+ break
zoedepth/data/vkitti2.py ADDED
@@ -0,0 +1,187 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+
25
+ import os
26
+
27
+ import cv2
28
+ import numpy as np
29
+ import torch
30
+ from PIL import Image
31
+ from torch.utils.data import DataLoader, Dataset
32
+ from torchvision import transforms
33
+
34
+
35
+ class ToTensor(object):
36
+ def __init__(self):
37
+ # self.normalize = transforms.Normalize(
38
+ # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
39
+ self.normalize = lambda x: x
40
+ # self.resize = transforms.Resize((375, 1242))
41
+
42
+ def __call__(self, sample):
43
+ image, depth = sample['image'], sample['depth']
44
+
45
+ image = self.to_tensor(image)
46
+ image = self.normalize(image)
47
+ depth = self.to_tensor(depth)
48
+
49
+ # image = self.resize(image)
50
+
51
+ return {'image': image, 'depth': depth, 'dataset': "vkitti"}
52
+
53
+ def to_tensor(self, pic):
54
+
55
+ if isinstance(pic, np.ndarray):
56
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
57
+ return img
58
+
59
+ # handle PIL Image
60
+ if pic.mode == 'I':
61
+ img = torch.from_numpy(np.array(pic, np.int32, copy=False))
62
+ elif pic.mode == 'I;16':
63
+ img = torch.from_numpy(np.array(pic, np.int16, copy=False))
64
+ else:
65
+ img = torch.ByteTensor(
66
+ torch.ByteStorage.from_buffer(pic.tobytes()))
67
+ # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
68
+ if pic.mode == 'YCbCr':
69
+ nchannel = 3
70
+ elif pic.mode == 'I;16':
71
+ nchannel = 1
72
+ else:
73
+ nchannel = len(pic.mode)
74
+ img = img.view(pic.size[1], pic.size[0], nchannel)
75
+
76
+ img = img.transpose(0, 1).transpose(0, 2).contiguous()
77
+ if isinstance(img, torch.ByteTensor):
78
+ return img.float()
79
+ else:
80
+ return img
81
+
82
+
83
+ class VKITTI2(Dataset):
84
+ def __init__(self, data_dir_root, do_kb_crop=True, split="test"):
85
+ import glob
86
+
87
+ # image paths are of the form <data_dir_root>/rgb/<scene>/<variant>/frames/<rgb,depth>/Camera<0,1>/rgb_{}.jpg
88
+ self.image_files = glob.glob(os.path.join(
89
+ data_dir_root, "rgb", "**", "frames", "rgb", "Camera_0", '*.jpg'), recursive=True)
90
+ self.depth_files = [r.replace("/rgb/", "/depth/").replace(
91
+ "rgb_", "depth_").replace(".jpg", ".png") for r in self.image_files]
92
+ self.do_kb_crop = do_kb_crop
93
+ self.transform = ToTensor()
94
+
95
+ # If train test split is not created, then create one.
96
+ # Split is such that 8% of the frames from each scene are used for testing.
97
+ if not os.path.exists(os.path.join(data_dir_root, "train.txt")):
98
+ import random
99
+ scenes = set([os.path.basename(os.path.dirname(
100
+ os.path.dirname(os.path.dirname(f)))) for f in self.image_files])
101
+ train_files = []
102
+ test_files = []
103
+ for scene in scenes:
104
+ scene_files = [f for f in self.image_files if os.path.basename(
105
+ os.path.dirname(os.path.dirname(os.path.dirname(f)))) == scene]
106
+ random.shuffle(scene_files)
107
+ train_files.extend(scene_files[:int(len(scene_files) * 0.92)])
108
+ test_files.extend(scene_files[int(len(scene_files) * 0.92):])
109
+ with open(os.path.join(data_dir_root, "train.txt"), "w") as f:
110
+ f.write("\n".join(train_files))
111
+ with open(os.path.join(data_dir_root, "test.txt"), "w") as f:
112
+ f.write("\n".join(test_files))
113
+
114
+ if split == "train":
115
+ with open(os.path.join(data_dir_root, "train.txt"), "r") as f:
116
+ self.image_files = f.read().splitlines()
117
+ self.depth_files = [r.replace("/rgb/", "/depth/").replace(
118
+ "rgb_", "depth_").replace(".jpg", ".png") for r in self.image_files]
119
+ elif split == "test":
120
+ with open(os.path.join(data_dir_root, "test.txt"), "r") as f:
121
+ self.image_files = f.read().splitlines()
122
+ self.depth_files = [r.replace("/rgb/", "/depth/").replace(
123
+ "rgb_", "depth_").replace(".jpg", ".png") for r in self.image_files]
124
+
125
+ def __getitem__(self, idx):
126
+ image_path = self.image_files[idx]
127
+ depth_path = self.depth_files[idx]
128
+
129
+ image = Image.open(image_path)
130
+ # depth = Image.open(depth_path)
131
+ depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR |
132
+ cv2.IMREAD_ANYDEPTH) / 100.0 # cm to m
133
+ depth = Image.fromarray(depth)
134
+ # print("depth min max", depth.min(), depth.max())
135
+
136
+ # print(np.shape(image))
137
+ # print(np.shape(depth))
138
+
139
+ if self.do_kb_crop:
140
+ if idx == 0:
141
+ print("Using KB input crop")
142
+ height = image.height
143
+ width = image.width
144
+ top_margin = int(height - 352)
145
+ left_margin = int((width - 1216) / 2)
146
+ depth = depth.crop(
147
+ (left_margin, top_margin, left_margin + 1216, top_margin + 352))
148
+ image = image.crop(
149
+ (left_margin, top_margin, left_margin + 1216, top_margin + 352))
150
+ # uv = uv[:, top_margin:top_margin + 352, left_margin:left_margin + 1216]
151
+
152
+ image = np.asarray(image, dtype=np.float32) / 255.0
153
+ # depth = np.asarray(depth, dtype=np.uint16) /1.
154
+ depth = np.asarray(depth, dtype=np.float32) / 1.
155
+ depth[depth > 80] = -1
156
+
157
+ depth = depth[..., None]
158
+ sample = dict(image=image, depth=depth)
159
+
160
+ # return sample
161
+ sample = self.transform(sample)
162
+
163
+ if idx == 0:
164
+ print(sample["image"].shape)
165
+
166
+ return sample
167
+
168
+ def __len__(self):
169
+ return len(self.image_files)
170
+
171
+
172
+ def get_vkitti2_loader(data_dir_root, batch_size=1, **kwargs):
173
+ dataset = VKITTI2(data_dir_root)
174
+ return DataLoader(dataset, batch_size, **kwargs)
175
+
176
+
177
+ if __name__ == "__main__":
178
+ loader = get_vkitti2_loader(
179
+ data_dir_root="/home/bhatsf/shortcuts/datasets/vkitti2")
180
+ print("Total files", len(loader.dataset))
181
+ for i, sample in enumerate(loader):
182
+ print(sample["image"].shape)
183
+ print(sample["depth"].shape)
184
+ print(sample["dataset"])
185
+ print(sample['depth'].min(), sample['depth'].max())
186
+ if i > 5:
187
+ break
zoedepth/models/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ # MIT License
2
+
3
+ # Copyright (c) 2022 Intelligent Systems Lab Org
4
+
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ # File author: Shariq Farooq Bhat
24
+