AndreasLH commited on
Commit
db3da1e
·
1 Parent(s): dc15a2b
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +28 -0
  2. .gitmodules +9 -0
  3. .vscode/launch.json +100 -0
  4. DATA.md +219 -0
  5. Dockerfile +32 -0
  6. LICENSE.md +906 -0
  7. MODEL_ZOO.md +17 -0
  8. ProposalNetwork/utils/__init__.py +3 -0
  9. ProposalNetwork/utils/conversions.py +50 -0
  10. ProposalNetwork/utils/plane.py +209 -0
  11. ProposalNetwork/utils/spaces.py +328 -0
  12. ProposalNetwork/utils/utils.py +564 -0
  13. README.md +4 -5
  14. VisualiseGT.py +830 -0
  15. app.py +155 -0
  16. configs/Base.yaml +89 -0
  17. configs/Base_Omni3D.yaml +18 -0
  18. configs/Base_Omni3D_2D_only.yaml +20 -0
  19. configs/Base_Omni3D_in.yaml +18 -0
  20. configs/Base_Omni3D_og.yaml +18 -0
  21. configs/Base_Omni3D_out.yaml +18 -0
  22. configs/Base_Omni3D_prof.yaml +18 -0
  23. configs/Omni_combined.yaml +37 -0
  24. configs/category_meta.json +1 -0
  25. configs/cubercnn_DLA34_FPN.yaml +6 -0
  26. configs/cubercnn_ResNet34_FPN.yaml +7 -0
  27. configs/cubercnn_densenet_FPN.yaml +4 -0
  28. configs/cubercnn_mnasnet_FPN.yaml +4 -0
  29. configs/cubercnn_shufflenet_FPN.yaml +4 -0
  30. cubercnn/config/__init__.py +1 -0
  31. cubercnn/config/config.py +187 -0
  32. cubercnn/data/Omni_to_kitti.py +197 -0
  33. cubercnn/data/__init__.py +5 -0
  34. cubercnn/data/build.py +260 -0
  35. cubercnn/data/builtin.py +46 -0
  36. cubercnn/data/dataset_mapper.py +272 -0
  37. cubercnn/data/datasets.py +480 -0
  38. cubercnn/data/filter_ground.py +26 -0
  39. cubercnn/data/generate_depth_maps.py +86 -0
  40. cubercnn/data/generate_ground_segmentations.py +206 -0
  41. cubercnn/evaluation/__init__.py +1 -0
  42. cubercnn/evaluation/omni3d_evaluation.py +1706 -0
  43. cubercnn/modeling/backbone/__init__.py +5 -0
  44. cubercnn/modeling/backbone/densenet.py +64 -0
  45. cubercnn/modeling/backbone/dla.py +507 -0
  46. cubercnn/modeling/backbone/mnasnet.py +63 -0
  47. cubercnn/modeling/backbone/resnet.py +96 -0
  48. cubercnn/modeling/backbone/shufflenet.py +69 -0
  49. cubercnn/modeling/meta_arch/__init__.py +1 -0
  50. cubercnn/modeling/meta_arch/rcnn3d.py +618 -0
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # folders or files
2
+ detectron2/
3
+ pytorch3d/
4
+ datasets/*
5
+ testing/image_2
6
+ training/image_2
7
+ # .vscode/
8
+ .ipynb_checkpoints/
9
+ .idea/
10
+ output/
11
+ cubercnn/external/
12
+ wandb/
13
+ hpc_logs/
14
+ depth/checkpoints/
15
+ ProposalNetwork/proposals/network_out.pkl
16
+ .vscode/settings.json
17
+ submit.sh
18
+ profiling/
19
+
20
+ # filetypes
21
+ *.pyc
22
+ *.mexa64
23
+ */output/*
24
+ */output*/*
25
+ *~
26
+ *.so
27
+ #*.ipynb
28
+ ProposalNetwork/proposals/figs/*
.gitmodules ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [submodule "GroundingDINO"]
2
+ path = GroundingDINO
3
+ url = https://github.com/AndreasLH/GroundingDINO
4
+ [submodule "sam-hq"]
5
+ path = sam-hq
6
+ url = https://github.com/SysCV/sam-hq.git
7
+ [submodule "Depth-Anything-V2"]
8
+ path = Depth-Anything-V2
9
+ url = https://github.com/DepthAnything/Depth-Anything-V2
.vscode/launch.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+
8
+ {
9
+ "name": "Python: Current File",
10
+ "type": "python" ,
11
+ "request": "launch",
12
+ "program": "${file}",
13
+ "console": "integratedTerminal",
14
+ "justMyCode": true,
15
+ "args": []
16
+ },
17
+ {
18
+ "name": "Cube R-CNN Demo",
19
+ "type": "python",
20
+ "request": "launch",
21
+ "program": "demo/demo.py",
22
+ "console": "integratedTerminal",
23
+ "justMyCode": true,
24
+ "args": ["--config-file", "cubercnn://omni3d/cubercnn_DLA34_FPN.yaml", "--input-folder", "datasets/title", "--threshold", "0.25", "MODEL.WEIGHTS", "cubercnn://omni3d/cubercnn_DLA34_FPN.pth", "OUTPUT_DIR", "output/demo"]
25
+ },
26
+ {
27
+ "name": "Cube R-CNN 2D only",
28
+ "type": "python",
29
+ "request": "launch",
30
+ "program": "tools/train_net.py",
31
+ "console": "integratedTerminal",
32
+ "justMyCode": true,
33
+ "args": ["--config-file", "configs/Base_Omni3D_2D_only.yaml", "MODEL.WEIGHTS", "output/omni3d-2d-only/model_recent.pth", "OUTPUT_DIR", "output/omni3d-2d-only", "log", "False"]
34
+ },
35
+ {
36
+ "name": "Cube R-CNN Time equalised Demo",
37
+ "type": "python",
38
+ "request": "launch",
39
+ "program": "demo/demo.py",
40
+ "console": "integratedTerminal",
41
+ "justMyCode": true,
42
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "--input-folder", "datasets/coco_examples", "--threshold", "0.25", "MODEL.WEIGHTS", "output/omni_equalised/model_final.pth", "OUTPUT_DIR", "output/demo_time_equal"]
43
+ },
44
+ {
45
+ "name": "Cube R-CNN pseudo gt demo",
46
+ "type": "python",
47
+ "request": "launch",
48
+ "program": "demo/demo.py",
49
+ "console": "integratedTerminal",
50
+ "justMyCode": true,
51
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "--input-folder", "datasets/title", "--threshold", "0.25", "MODEL.WEIGHTS", "output/omni_pseudo_gt/model_final.pth", "OUTPUT_DIR", "output/demo_pseudogt"]
52
+ },
53
+ {
54
+ "name": "train",
55
+ "type": "python",
56
+ "request": "launch",
57
+ "program": "tools/train_net.py",
58
+ "console": "integratedTerminal",
59
+ "justMyCode": true,
60
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "OUTPUT_DIR", "output/omni3d_example_run"]
61
+ },
62
+ {
63
+ "name": "resume train",
64
+ "type": "python",
65
+ "request": "launch",
66
+ "program": "tools/train_net.py",
67
+ "console": "integratedTerminal",
68
+ "justMyCode": true,
69
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "--resume", "OUTPUT_DIR", "output/Baseline_sgd"]
70
+ },
71
+ {
72
+ "name": "eval, train_net pretrained",
73
+ "type": "python",
74
+ "request": "launch",
75
+ "program": "tools/train_net.py",
76
+ "console": "integratedTerminal",
77
+ "justMyCode": true,
78
+ "args": ["--eval-only", "--config-file", "cubercnn://omni3d/cubercnn_DLA34_FPN.yaml", "MODEL.WEIGHTS", "cubercnn://omni3d/cubercnn_DLA34_FPN.pth"]
79
+ },
80
+ {
81
+ "name": "eval, train_net locally trained",
82
+ "type": "python",
83
+ "request": "launch",
84
+ "program": "tools/train_net.py",
85
+ "console": "integratedTerminal",
86
+ "justMyCode": true,
87
+ "args": ["--eval-only", "--config-file", "configs/Base_Omni3D.yaml", "MODEL.WEIGHTS", "output/Baseline_sgd/model_final.pth"]
88
+ },
89
+ {
90
+ "name": "train Cube R-CNN weak loss",
91
+ "type": "python",
92
+ "request": "launch",
93
+ "program": "tools/train_net.py",
94
+ "console": "integratedTerminal",
95
+ "justMyCode": true,
96
+ "args": ["--config-file", "configs/Omni_combined.yaml", "OUTPUT_DIR", "output/omni3d_combined_test", "log", "False", "loss_functions", "['iou', 'z_pseudo_gt_center', 'pose_alignment', 'pose_ground']"]
97
+ },
98
+
99
+ ]
100
+ }
DATA.md ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - [Data Preparation](#data-preparation)
2
+ - [Download Omni3D json](#download-omni3d-json)
3
+ - [Download Individual Datasets](#download-individual-datasets)
4
+ - [Data Usage](#data-usage)
5
+ - [Coordinate System](#coordinate-system)
6
+ - [Annotation Format](#annotation-format)
7
+ - [Example Loading Data](#example-loading-data)
8
+
9
+ # Data Preparation
10
+
11
+ The Omni3D dataset is comprised of 6 datasets which have been pre-processed into the same annotation format and camera coordinate systems. To use a subset or the full dataset you must download:
12
+
13
+ 1. The processed Omni3D json files
14
+ 2. RGB images from each dataset separately
15
+
16
+ ## Download Omni3D json
17
+
18
+ Run
19
+
20
+ ```
21
+ sh datasets/Omni3D/download_omni3d_json.sh
22
+ ```
23
+
24
+ to download and extract the Omni3D train, val and test json annotation files.
25
+
26
+ ## Download Individual Datasets
27
+
28
+ Below are the instructions for setting up each individual dataset. It is recommended to download only the data you plan to use.
29
+
30
+ ### KITTI
31
+ Download the left color images from [KITTI's official website](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below. Note that we only require the image_2 folder.
32
+
33
+ ```bash
34
+ datasets/KITTI_object
35
+ └── training
36
+ ├── image_2
37
+ ```
38
+
39
+
40
+ ### nuScenes
41
+
42
+ Download the trainval images from the [official nuScenes website](https://www.nuscenes.org/nuscenes#download). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below. Note that we only require the CAM_FRONT folder.
43
+
44
+ ```bash
45
+ datasets/nuScenes/samples
46
+ └── samples
47
+ ├── CAM_FRONT
48
+ ```
49
+
50
+ ### Objectron
51
+
52
+ Run
53
+
54
+ ```
55
+ sh datasets/objectron/download_objectron_images.sh
56
+ ```
57
+
58
+ to download and extract the Objectron pre-processed images (~24 GB).
59
+
60
+ ### SUN RGB-D
61
+
62
+ Download the "SUNRGBD V1" images at [SUN RGB-D's official website](https://rgbd.cs.princeton.edu/). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below.
63
+
64
+ ```bash
65
+ ./Omni3D/datasets/SUNRGBD
66
+ ├── kv1
67
+ ├── kv2
68
+ ├── realsense
69
+ ```
70
+
71
+ ### ARKitScenes
72
+
73
+ Run
74
+
75
+ ```
76
+ sh datasets/ARKitScenes/download_arkitscenes_images.sh
77
+ ```
78
+
79
+ to download and extract the ARKitScenes pre-processed images (~28 GB).
80
+
81
+ ### Hypersim
82
+
83
+ Follow the [download instructions](https://github.com/apple/ml-hypersim/tree/main/contrib/99991) from [Thomas Germer](https://github.com/99991) in order to download all \*tonemap.jpg preview images in order to avoid downloading the full Hypersim dataset. For example:
84
+
85
+ ```bash
86
+ git clone https://github.com/apple/ml-hypersim
87
+ cd ml-hypersim/
88
+ python contrib/99991/download.py -c .tonemap.jpg -d /path/to/Omni3D/datasets/hypersim --silent
89
+ ```
90
+
91
+ Then arrange or unzip the downloaded images into the root `./Omni3D/` so that it has the below folder structure.
92
+
93
+ ```bash
94
+ datasets/hypersim/
95
+ ├── ai_001_001
96
+ ├── ai_001_002
97
+ ├── ai_001_003
98
+ ├── ai_001_004
99
+ ├── ai_001_005
100
+ ├── ai_001_006
101
+ ...
102
+ ```
103
+
104
+ # Data Usage
105
+
106
+ Below we describe the unified 3D annotation coordinate systems, annotation format, and an example script.
107
+
108
+
109
+ ## Coordinate System
110
+
111
+ All 3D annotations are provided in a shared camera coordinate system with
112
+ +x right, +y down, +z toward screen.
113
+
114
+ The vertex order of bbox3D_cam:
115
+ ```
116
+ v4_____________________v5
117
+ /| /|
118
+ / | / |
119
+ / | / |
120
+ /___|_________________/ |
121
+ v0| | |v1 |
122
+ | | | |
123
+ | | | |
124
+ | | | |
125
+ | |_________________|___|
126
+ | / v7 | /v6
127
+ | / | /
128
+ | / | /
129
+ |/_____________________|/
130
+ v3 v2
131
+ ```
132
+
133
+ ## Annotation Format
134
+ Each dataset is formatted as a dict in python in the below format.
135
+
136
+ ```python
137
+ dataset {
138
+ "info" : info,
139
+ "images" : [image],
140
+ "categories" : [category],
141
+ "annotations" : [object],
142
+ }
143
+
144
+ info {
145
+ "id" : str,
146
+ "source" : int,
147
+ "name" : str,
148
+ "split" : str,
149
+ "version" : str,
150
+ "url" : str,
151
+ }
152
+
153
+ image {
154
+ "id" : int,
155
+ "dataset_id" : int,
156
+ "width" : int,
157
+ "height" : int,
158
+ "file_path" : str,
159
+ "K" : list (3x3),
160
+ "src_90_rotate" : int, # im was rotated X times, 90 deg counterclockwise
161
+ "src_flagged" : bool, # flagged as potentially inconsistent sky direction
162
+ }
163
+
164
+ category {
165
+ "id" : int,
166
+ "name" : str,
167
+ "supercategory" : str
168
+ }
169
+
170
+ object {
171
+
172
+ "id" : int, # unique annotation identifier
173
+ "image_id" : int, # identifier for image
174
+ "category_id" : int, # identifier for the category
175
+ "category_name" : str, # plain name for the category
176
+
177
+ # General 2D/3D Box Parameters.
178
+ # Values are set to -1 when unavailable.
179
+ "valid3D" : bool, # flag for no reliable 3D box
180
+ "bbox2D_tight" : [x1, y1, x2, y2], # 2D corners of annotated tight box
181
+ "bbox2D_proj" : [x1, y1, x2, y2], # 2D corners projected from bbox3D
182
+ "bbox2D_trunc" : [x1, y1, x2, y2], # 2D corners projected from bbox3D then truncated
183
+ "bbox3D_cam" : [[x1, y1, z1]...[x8, y8, z8]] # 3D corners in meters and camera coordinates
184
+ "center_cam" : [x, y, z], # 3D center in meters and camera coordinates
185
+ "dimensions" : [width, height, length], # 3D attributes for object dimensions in meters
186
+ "R_cam" : list (3x3), # 3D rotation matrix to the camera frame rotation
187
+
188
+ # Optional dataset specific properties,
189
+ # used mainly for evaluation and ignore.
190
+ # Values are set to -1 when unavailable.
191
+ "behind_camera" : bool, # a corner is behind camera
192
+ "visibility" : float, # annotated visibility 0 to 1
193
+ "truncation" : float, # computed truncation 0 to 1
194
+ "segmentation_pts" : int, # visible instance segmentation points
195
+ "lidar_pts" : int, # visible LiDAR points in the object
196
+ "depth_error" : float, # L1 of depth map and rendered object
197
+ }
198
+ ```
199
+
200
+
201
+ ## Example Loading Data
202
+ Each dataset is named as "Omni3D_{name}_{split}.json" where split can be train, val, or test.
203
+
204
+ The annotations are in a COCO-like format such that if you load the json from the Omni3D class which inherits the [COCO class](https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L70), you can use basic COCO dataset functions as demonstrated with the below code.
205
+
206
+ ```python
207
+ from cubercnn import data
208
+
209
+ dataset_paths_to_json = ['path/to/Omni3D/{name}_{split}.json', ...]
210
+
211
+ # Example 1. load all images
212
+ dataset = data.Omni3D(dataset_paths_to_json)
213
+ imgIds = dataset.getImgIds()
214
+ imgs = dataset.loadImgs(imgIds)
215
+
216
+ # Example 2. load annotations for image index 0
217
+ annIds = dataset.getAnnIds(imgIds=imgs[0]['id'])
218
+ anns = dataset.loadAnns(annIds)
219
+ ```
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base image, have to use the full version to use the git features
2
+ FROM python:3.12
3
+ # https://huggingface.co/docs/hub/spaces-sdks-docker-first-demo
4
+
5
+ # RUN apt-get install -y git
6
+
7
+ WORKDIR /code
8
+ COPY ./requirements.txt /code/requirements.txt
9
+ COPY ./pre-requirements.txt /code/pre-requirements.txt
10
+ COPY ./GroundingDINO /code/GroundingDINO
11
+ COPY ./sam-hq /code/sam-hq
12
+
13
+ RUN pip install --no-cache-dir -r /code/pre-requirements.txt
14
+ RUN pip install --no-cache-dir -r /code/requirements.txt
15
+
16
+ # Set up a new user named "user" with user ID 1000
17
+ RUN useradd -m -u 1000 user
18
+
19
+ # Switch to the "user" user
20
+ USER user
21
+
22
+ # Set home to the user's home directory
23
+ ENV HOME=/home/user \
24
+ PATH=/home/user/.local/bin:$PATH
25
+
26
+ # Set the working directory to the user's home directory
27
+ WORKDIR $HOME/app
28
+
29
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
30
+ COPY --chown=user . $HOME/app
31
+
32
+ CMD ["python", "app.py"]
LICENSE.md ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - [Omni3D and Cube R-CNN License](#omni3d-and-cube-r-cnn-license)
2
+ - [ARKitScenes License](#arkitscenes-license)
3
+ - [Objectron License](#objectron-license)
4
+
5
+ # Omni3D and Cube R-CNN License
6
+ https://github.com/facebookresearch/omni3d
7
+ https://github.com/facebookresearch/omni3d/blob/main/LICENSE.md
8
+
9
+ Attribution-NonCommercial 4.0 International
10
+
11
+ =======================================================================
12
+
13
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
14
+ does not provide legal services or legal advice. Distribution of
15
+ Creative Commons public licenses does not create a lawyer-client or
16
+ other relationship. Creative Commons makes its licenses and related
17
+ information available on an "as-is" basis. Creative Commons gives no
18
+ warranties regarding its licenses, any material licensed under their
19
+ terms and conditions, or any related information. Creative Commons
20
+ disclaims all liability for damages resulting from their use to the
21
+ fullest extent possible.
22
+
23
+ Using Creative Commons Public Licenses
24
+
25
+ Creative Commons public licenses provide a standard set of terms and
26
+ conditions that creators and other rights holders may use to share
27
+ original works of authorship and other material subject to copyright
28
+ and certain other rights specified in the public license below. The
29
+ following considerations are for informational purposes only, are not
30
+ exhaustive, and do not form part of our licenses.
31
+
32
+ Considerations for licensors: Our public licenses are
33
+ intended for use by those authorized to give the public
34
+ permission to use material in ways otherwise restricted by
35
+ copyright and certain other rights. Our licenses are
36
+ irrevocable. Licensors should read and understand the terms
37
+ and conditions of the license they choose before applying it.
38
+ Licensors should also secure all rights necessary before
39
+ applying our licenses so that the public can reuse the
40
+ material as expected. Licensors should clearly mark any
41
+ material not subject to the license. This includes other CC-
42
+ licensed material, or material used under an exception or
43
+ limitation to copyright. More considerations for licensors:
44
+ wiki.creativecommons.org/Considerations_for_licensors
45
+
46
+ Considerations for the public: By using one of our public
47
+ licenses, a licensor grants the public permission to use the
48
+ licensed material under specified terms and conditions. If
49
+ the licensor's permission is not necessary for any reason--for
50
+ example, because of any applicable exception or limitation to
51
+ copyright--then that use is not regulated by the license. Our
52
+ licenses grant only permissions under copyright and certain
53
+ other rights that a licensor has authority to grant. Use of
54
+ the licensed material may still be restricted for other
55
+ reasons, including because others have copyright or other
56
+ rights in the material. A licensor may make special requests,
57
+ such as asking that all changes be marked or described.
58
+ Although not required by our licenses, you are encouraged to
59
+ respect those requests where reasonable. More_considerations
60
+ for the public:
61
+ wiki.creativecommons.org/Considerations_for_licensees
62
+
63
+ =======================================================================
64
+
65
+ Creative Commons Attribution-NonCommercial 4.0 International Public
66
+ License
67
+
68
+ By exercising the Licensed Rights (defined below), You accept and agree
69
+ to be bound by the terms and conditions of this Creative Commons
70
+ Attribution-NonCommercial 4.0 International Public License ("Public
71
+ License"). To the extent this Public License may be interpreted as a
72
+ contract, You are granted the Licensed Rights in consideration of Your
73
+ acceptance of these terms and conditions, and the Licensor grants You
74
+ such rights in consideration of benefits the Licensor receives from
75
+ making the Licensed Material available under these terms and
76
+ conditions.
77
+
78
+ Section 1 -- Definitions.
79
+
80
+ a. Adapted Material means material subject to Copyright and Similar
81
+ Rights that is derived from or based upon the Licensed Material
82
+ and in which the Licensed Material is translated, altered,
83
+ arranged, transformed, or otherwise modified in a manner requiring
84
+ permission under the Copyright and Similar Rights held by the
85
+ Licensor. For purposes of this Public License, where the Licensed
86
+ Material is a musical work, performance, or sound recording,
87
+ Adapted Material is always produced where the Licensed Material is
88
+ synched in timed relation with a moving image.
89
+
90
+ b. Adapter's License means the license You apply to Your Copyright
91
+ and Similar Rights in Your contributions to Adapted Material in
92
+ accordance with the terms and conditions of this Public License.
93
+
94
+ c. Copyright and Similar Rights means copyright and/or similar rights
95
+ closely related to copyright including, without limitation,
96
+ performance, broadcast, sound recording, and Sui Generis Database
97
+ Rights, without regard to how the rights are labeled or
98
+ categorized. For purposes of this Public License, the rights
99
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
100
+ Rights.
101
+ d. Effective Technological Measures means those measures that, in the
102
+ absence of proper authority, may not be circumvented under laws
103
+ fulfilling obligations under Article 11 of the WIPO Copyright
104
+ Treaty adopted on December 20, 1996, and/or similar international
105
+ agreements.
106
+
107
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
108
+ any other exception or limitation to Copyright and Similar Rights
109
+ that applies to Your use of the Licensed Material.
110
+
111
+ f. Licensed Material means the artistic or literary work, database,
112
+ or other material to which the Licensor applied this Public
113
+ License.
114
+
115
+ g. Licensed Rights means the rights granted to You subject to the
116
+ terms and conditions of this Public License, which are limited to
117
+ all Copyright and Similar Rights that apply to Your use of the
118
+ Licensed Material and that the Licensor has authority to license.
119
+
120
+ h. Licensor means the individual(s) or entity(ies) granting rights
121
+ under this Public License.
122
+
123
+ i. NonCommercial means not primarily intended for or directed towards
124
+ commercial advantage or monetary compensation. For purposes of
125
+ this Public License, the exchange of the Licensed Material for
126
+ other material subject to Copyright and Similar Rights by digital
127
+ file-sharing or similar means is NonCommercial provided there is
128
+ no payment of monetary compensation in connection with the
129
+ exchange.
130
+
131
+ j. Share means to provide material to the public by any means or
132
+ process that requires permission under the Licensed Rights, such
133
+ as reproduction, public display, public performance, distribution,
134
+ dissemination, communication, or importation, and to make material
135
+ available to the public including in ways that members of the
136
+ public may access the material from a place and at a time
137
+ individually chosen by them.
138
+
139
+ k. Sui Generis Database Rights means rights other than copyright
140
+ resulting from Directive 96/9/EC of the European Parliament and of
141
+ the Council of 11 March 1996 on the legal protection of databases,
142
+ as amended and/or succeeded, as well as other essentially
143
+ equivalent rights anywhere in the world.
144
+
145
+ l. You means the individual or entity exercising the Licensed Rights
146
+ under this Public License. Your has a corresponding meaning.
147
+
148
+ Section 2 -- Scope.
149
+
150
+ a. License grant.
151
+
152
+ 1. Subject to the terms and conditions of this Public License,
153
+ the Licensor hereby grants You a worldwide, royalty-free,
154
+ non-sublicensable, non-exclusive, irrevocable license to
155
+ exercise the Licensed Rights in the Licensed Material to:
156
+
157
+ a. reproduce and Share the Licensed Material, in whole or
158
+ in part, for NonCommercial purposes only; and
159
+
160
+ b. produce, reproduce, and Share Adapted Material for
161
+ NonCommercial purposes only.
162
+
163
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
164
+ Exceptions and Limitations apply to Your use, this Public
165
+ License does not apply, and You do not need to comply with
166
+ its terms and conditions.
167
+
168
+ 3. Term. The term of this Public License is specified in Section
169
+ 6(a).
170
+
171
+ 4. Media and formats; technical modifications allowed. The
172
+ Licensor authorizes You to exercise the Licensed Rights in
173
+ all media and formats whether now known or hereafter created,
174
+ and to make technical modifications necessary to do so. The
175
+ Licensor waives and/or agrees not to assert any right or
176
+ authority to forbid You from making technical modifications
177
+ necessary to exercise the Licensed Rights, including
178
+ technical modifications necessary to circumvent Effective
179
+ Technological Measures. For purposes of this Public License,
180
+ simply making modifications authorized by this Section 2(a)
181
+ (4) never produces Adapted Material.
182
+
183
+ 5. Downstream recipients.
184
+
185
+ a. Offer from the Licensor -- Licensed Material. Every
186
+ recipient of the Licensed Material automatically
187
+ receives an offer from the Licensor to exercise the
188
+ Licensed Rights under the terms and conditions of this
189
+ Public License.
190
+
191
+ b. No downstream restrictions. You may not offer or impose
192
+ any additional or different terms or conditions on, or
193
+ apply any Effective Technological Measures to, the
194
+ Licensed Material if doing so restricts exercise of the
195
+ Licensed Rights by any recipient of the Licensed
196
+ Material.
197
+
198
+ 6. No endorsement. Nothing in this Public License constitutes or
199
+ may be construed as permission to assert or imply that You
200
+ are, or that Your use of the Licensed Material is, connected
201
+ with, or sponsored, endorsed, or granted official status by,
202
+ the Licensor or others designated to receive attribution as
203
+ provided in Section 3(a)(1)(A)(i).
204
+
205
+ b. Other rights.
206
+
207
+ 1. Moral rights, such as the right of integrity, are not
208
+ licensed under this Public License, nor are publicity,
209
+ privacy, and/or other similar personality rights; however, to
210
+ the extent possible, the Licensor waives and/or agrees not to
211
+ assert any such rights held by the Licensor to the limited
212
+ extent necessary to allow You to exercise the Licensed
213
+ Rights, but not otherwise.
214
+
215
+ 2. Patent and trademark rights are not licensed under this
216
+ Public License.
217
+
218
+ 3. To the extent possible, the Licensor waives any right to
219
+ collect royalties from You for the exercise of the Licensed
220
+ Rights, whether directly or through a collecting society
221
+ under any voluntary or waivable statutory or compulsory
222
+ licensing scheme. In all other cases the Licensor expressly
223
+ reserves any right to collect such royalties, including when
224
+ the Licensed Material is used other than for NonCommercial
225
+ purposes.
226
+
227
+ Section 3 -- License Conditions.
228
+
229
+ Your exercise of the Licensed Rights is expressly made subject to the
230
+ following conditions.
231
+
232
+ a. Attribution.
233
+
234
+ 1. If You Share the Licensed Material (including in modified
235
+ form), You must:
236
+
237
+ a. retain the following if it is supplied by the Licensor
238
+ with the Licensed Material:
239
+
240
+ i. identification of the creator(s) of the Licensed
241
+ Material and any others designated to receive
242
+ attribution, in any reasonable manner requested by
243
+ the Licensor (including by pseudonym if
244
+ designated);
245
+
246
+ ii. a copyright notice;
247
+
248
+ iii. a notice that refers to this Public License;
249
+
250
+ iv. a notice that refers to the disclaimer of
251
+ warranties;
252
+
253
+ v. a URI or hyperlink to the Licensed Material to the
254
+ extent reasonably practicable;
255
+
256
+ b. indicate if You modified the Licensed Material and
257
+ retain an indication of any previous modifications; and
258
+
259
+ c. indicate the Licensed Material is licensed under this
260
+ Public License, and include the text of, or the URI or
261
+ hyperlink to, this Public License.
262
+
263
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
264
+ reasonable manner based on the medium, means, and context in
265
+ which You Share the Licensed Material. For example, it may be
266
+ reasonable to satisfy the conditions by providing a URI or
267
+ hyperlink to a resource that includes the required
268
+ information.
269
+
270
+ 3. If requested by the Licensor, You must remove any of the
271
+ information required by Section 3(a)(1)(A) to the extent
272
+ reasonably practicable.
273
+
274
+ 4. If You Share Adapted Material You produce, the Adapter's
275
+ License You apply must not prevent recipients of the Adapted
276
+ Material from complying with this Public License.
277
+
278
+ Section 4 -- Sui Generis Database Rights.
279
+
280
+ Where the Licensed Rights include Sui Generis Database Rights that
281
+ apply to Your use of the Licensed Material:
282
+
283
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
284
+ to extract, reuse, reproduce, and Share all or a substantial
285
+ portion of the contents of the database for NonCommercial purposes
286
+ only;
287
+
288
+ b. if You include all or a substantial portion of the database
289
+ contents in a database in which You have Sui Generis Database
290
+ Rights, then the database in which You have Sui Generis Database
291
+ Rights (but not its individual contents) is Adapted Material; and
292
+
293
+ c. You must comply with the conditions in Section 3(a) if You Share
294
+ all or a substantial portion of the contents of the database.
295
+
296
+ For the avoidance of doubt, this Section 4 supplements and does not
297
+ replace Your obligations under this Public License where the Licensed
298
+ Rights include other Copyright and Similar Rights.
299
+
300
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
301
+
302
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
303
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
304
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
305
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
306
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
307
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
308
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
309
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
310
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
311
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
312
+
313
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
314
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
315
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
316
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
317
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
318
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
319
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
320
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
321
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
322
+
323
+ c. The disclaimer of warranties and limitation of liability provided
324
+ above shall be interpreted in a manner that, to the extent
325
+ possible, most closely approximates an absolute disclaimer and
326
+ waiver of all liability.
327
+
328
+ Section 6 -- Term and Termination.
329
+
330
+ a. This Public License applies for the term of the Copyright and
331
+ Similar Rights licensed here. However, if You fail to comply with
332
+ this Public License, then Your rights under this Public License
333
+ terminate automatically.
334
+
335
+ b. Where Your right to use the Licensed Material has terminated under
336
+ Section 6(a), it reinstates:
337
+
338
+ 1. automatically as of the date the violation is cured, provided
339
+ it is cured within 30 days of Your discovery of the
340
+ violation; or
341
+
342
+ 2. upon express reinstatement by the Licensor.
343
+
344
+ For the avoidance of doubt, this Section 6(b) does not affect any
345
+ right the Licensor may have to seek remedies for Your violations
346
+ of this Public License.
347
+
348
+ c. For the avoidance of doubt, the Licensor may also offer the
349
+ Licensed Material under separate terms or conditions or stop
350
+ distributing the Licensed Material at any time; however, doing so
351
+ will not terminate this Public License.
352
+
353
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
354
+ License.
355
+
356
+ Section 7 -- Other Terms and Conditions.
357
+
358
+ a. The Licensor shall not be bound by any additional or different
359
+ terms or conditions communicated by You unless expressly agreed.
360
+
361
+ b. Any arrangements, understandings, or agreements regarding the
362
+ Licensed Material not stated herein are separate from and
363
+ independent of the terms and conditions of this Public License.
364
+
365
+ Section 8 -- Interpretation.
366
+
367
+ a. For the avoidance of doubt, this Public License does not, and
368
+ shall not be interpreted to, reduce, limit, restrict, or impose
369
+ conditions on any use of the Licensed Material that could lawfully
370
+ be made without permission under this Public License.
371
+
372
+ b. To the extent possible, if any provision of this Public License is
373
+ deemed unenforceable, it shall be automatically reformed to the
374
+ minimum extent necessary to make it enforceable. If the provision
375
+ cannot be reformed, it shall be severed from this Public License
376
+ without affecting the enforceability of the remaining terms and
377
+ conditions.
378
+
379
+ c. No term or condition of this Public License will be waived and no
380
+ failure to comply consented to unless expressly agreed to by the
381
+ Licensor.
382
+
383
+ d. Nothing in this Public License constitutes or may be interpreted
384
+ as a limitation upon, or waiver of, any privileges and immunities
385
+ that apply to the Licensor or You, including from the legal
386
+ processes of any jurisdiction or authority.
387
+
388
+ =======================================================================
389
+
390
+ Creative Commons is not a party to its public
391
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
392
+ its public licenses to material it publishes and in those instances
393
+ will be considered the “Licensor.” The text of the Creative Commons
394
+ public licenses is dedicated to the public domain under the CC0 Public
395
+ Domain Dedication. Except for the limited purpose of indicating that
396
+ material is shared under a Creative Commons public license or as
397
+ otherwise permitted by the Creative Commons policies published at
398
+ creativecommons.org/policies, Creative Commons does not authorize the
399
+ use of the trademark "Creative Commons" or any other trademark or logo
400
+ of Creative Commons without its prior written consent including,
401
+ without limitation, in connection with any unauthorized modifications
402
+ to any of its public licenses or any other arrangements,
403
+ understandings, or agreements concerning use of licensed material. For
404
+ the avoidance of doubt, this paragraph does not form part of the
405
+ public licenses.
406
+
407
+ Creative Commons may be contacted at creativecommons.org.
408
+
409
+ # ARKitScenes License
410
+ https://github.com/apple/ARKitScenes/
411
+ https://github.com/apple/ARKitScenes/blob/main/LICENSE
412
+
413
+ Attribution-NonCommercial-ShareAlike 4.0 International
414
+
415
+ =======================================================================
416
+
417
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
418
+ does not provide legal services or legal advice. Distribution of
419
+ Creative Commons public licenses does not create a lawyer-client or
420
+ other relationship. Creative Commons makes its licenses and related
421
+ information available on an "as-is" basis. Creative Commons gives no
422
+ warranties regarding its licenses, any material licensed under their
423
+ terms and conditions, or any related information. Creative Commons
424
+ disclaims all liability for damages resulting from their use to the
425
+ fullest extent possible.
426
+
427
+ Using Creative Commons Public Licenses
428
+
429
+ Creative Commons public licenses provide a standard set of terms and
430
+ conditions that creators and other rights holders may use to share
431
+ original works of authorship and other material subject to copyright
432
+ and certain other rights specified in the public license below. The
433
+ following considerations are for informational purposes only, are not
434
+ exhaustive, and do not form part of our licenses.
435
+
436
+ Considerations for licensors: Our public licenses are
437
+ intended for use by those authorized to give the public
438
+ permission to use material in ways otherwise restricted by
439
+ copyright and certain other rights. Our licenses are
440
+ irrevocable. Licensors should read and understand the terms
441
+ and conditions of the license they choose before applying it.
442
+ Licensors should also secure all rights necessary before
443
+ applying our licenses so that the public can reuse the
444
+ material as expected. Licensors should clearly mark any
445
+ material not subject to the license. This includes other CC-
446
+ licensed material, or material used under an exception or
447
+ limitation to copyright. More considerations for licensors:
448
+ wiki.creativecommons.org/Considerations_for_licensors
449
+
450
+ Considerations for the public: By using one of our public
451
+ licenses, a licensor grants the public permission to use the
452
+ licensed material under specified terms and conditions. If
453
+ the licensor's permission is not necessary for any reason--for
454
+ example, because of any applicable exception or limitation to
455
+ copyright--then that use is not regulated by the license. Our
456
+ licenses grant only permissions under copyright and certain
457
+ other rights that a licensor has authority to grant. Use of
458
+ the licensed material may still be restricted for other
459
+ reasons, including because others have copyright or other
460
+ rights in the material. A licensor may make special requests,
461
+ such as asking that all changes be marked or described.
462
+ Although not required by our licenses, you are encouraged to
463
+ respect those requests where reasonable. More considerations
464
+ for the public:
465
+ wiki.creativecommons.org/Considerations_for_licensees
466
+
467
+ =======================================================================
468
+
469
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
470
+ Public License
471
+
472
+ By exercising the Licensed Rights (defined below), You accept and agree
473
+ to be bound by the terms and conditions of this Creative Commons
474
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
475
+ ("Public License"). To the extent this Public License may be
476
+ interpreted as a contract, You are granted the Licensed Rights in
477
+ consideration of Your acceptance of these terms and conditions, and the
478
+ Licensor grants You such rights in consideration of benefits the
479
+ Licensor receives from making the Licensed Material available under
480
+ these terms and conditions.
481
+
482
+
483
+ Section 1 -- Definitions.
484
+
485
+ a. Adapted Material means material subject to Copyright and Similar
486
+ Rights that is derived from or based upon the Licensed Material
487
+ and in which the Licensed Material is translated, altered,
488
+ arranged, transformed, or otherwise modified in a manner requiring
489
+ permission under the Copyright and Similar Rights held by the
490
+ Licensor. For purposes of this Public License, where the Licensed
491
+ Material is a musical work, performance, or sound recording,
492
+ Adapted Material is always produced where the Licensed Material is
493
+ synched in timed relation with a moving image.
494
+
495
+ b. Adapter's License means the license You apply to Your Copyright
496
+ and Similar Rights in Your contributions to Adapted Material in
497
+ accordance with the terms and conditions of this Public License.
498
+
499
+ c. BY-NC-SA Compatible License means a license listed at
500
+ creativecommons.org/compatiblelicenses, approved by Creative
501
+ Commons as essentially the equivalent of this Public License.
502
+
503
+ d. Copyright and Similar Rights means copyright and/or similar rights
504
+ closely related to copyright including, without limitation,
505
+ performance, broadcast, sound recording, and Sui Generis Database
506
+ Rights, without regard to how the rights are labeled or
507
+ categorized. For purposes of this Public License, the rights
508
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
509
+ Rights.
510
+
511
+ e. Effective Technological Measures means those measures that, in the
512
+ absence of proper authority, may not be circumvented under laws
513
+ fulfilling obligations under Article 11 of the WIPO Copyright
514
+ Treaty adopted on December 20, 1996, and/or similar international
515
+ agreements.
516
+
517
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
518
+ any other exception or limitation to Copyright and Similar Rights
519
+ that applies to Your use of the Licensed Material.
520
+
521
+ g. License Elements means the license attributes listed in the name
522
+ of a Creative Commons Public License. The License Elements of this
523
+ Public License are Attribution, NonCommercial, and ShareAlike.
524
+
525
+ h. Licensed Material means the artistic or literary work, database,
526
+ or other material to which the Licensor applied this Public
527
+ License.
528
+
529
+ i. Licensed Rights means the rights granted to You subject to the
530
+ terms and conditions of this Public License, which are limited to
531
+ all Copyright and Similar Rights that apply to Your use of the
532
+ Licensed Material and that the Licensor has authority to license.
533
+
534
+ j. Licensor means the individual(s) or entity(ies) granting rights
535
+ under this Public License.
536
+
537
+ k. NonCommercial means not primarily intended for or directed towards
538
+ commercial advantage or monetary compensation. For purposes of
539
+ this Public License, the exchange of the Licensed Material for
540
+ other material subject to Copyright and Similar Rights by digital
541
+ file-sharing or similar means is NonCommercial provided there is
542
+ no payment of monetary compensation in connection with the
543
+ exchange.
544
+
545
+ l. Share means to provide material to the public by any means or
546
+ process that requires permission under the Licensed Rights, such
547
+ as reproduction, public display, public performance, distribution,
548
+ dissemination, communication, or importation, and to make material
549
+ available to the public including in ways that members of the
550
+ public may access the material from a place and at a time
551
+ individually chosen by them.
552
+
553
+ m. Sui Generis Database Rights means rights other than copyright
554
+ resulting from Directive 96/9/EC of the European Parliament and of
555
+ the Council of 11 March 1996 on the legal protection of databases,
556
+ as amended and/or succeeded, as well as other essentially
557
+ equivalent rights anywhere in the world.
558
+
559
+ n. You means the individual or entity exercising the Licensed Rights
560
+ under this Public License. Your has a corresponding meaning.
561
+
562
+
563
+ Section 2 -- Scope.
564
+
565
+ a. License grant.
566
+
567
+ 1. Subject to the terms and conditions of this Public License,
568
+ the Licensor hereby grants You a worldwide, royalty-free,
569
+ non-sublicensable, non-exclusive, irrevocable license to
570
+ exercise the Licensed Rights in the Licensed Material to:
571
+
572
+ a. reproduce and Share the Licensed Material, in whole or
573
+ in part, for NonCommercial purposes only; and
574
+
575
+ b. produce, reproduce, and Share Adapted Material for
576
+ NonCommercial purposes only.
577
+
578
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
579
+ Exceptions and Limitations apply to Your use, this Public
580
+ License does not apply, and You do not need to comply with
581
+ its terms and conditions.
582
+
583
+ 3. Term. The term of this Public License is specified in Section
584
+ 6(a).
585
+
586
+ 4. Media and formats; technical modifications allowed. The
587
+ Licensor authorizes You to exercise the Licensed Rights in
588
+ all media and formats whether now known or hereafter created,
589
+ and to make technical modifications necessary to do so. The
590
+ Licensor waives and/or agrees not to assert any right or
591
+ authority to forbid You from making technical modifications
592
+ necessary to exercise the Licensed Rights, including
593
+ technical modifications necessary to circumvent Effective
594
+ Technological Measures. For purposes of this Public License,
595
+ simply making modifications authorized by this Section 2(a)
596
+ (4) never produces Adapted Material.
597
+
598
+ 5. Downstream recipients.
599
+
600
+ a. Offer from the Licensor -- Licensed Material. Every
601
+ recipient of the Licensed Material automatically
602
+ receives an offer from the Licensor to exercise the
603
+ Licensed Rights under the terms and conditions of this
604
+ Public License.
605
+
606
+ b. Additional offer from the Licensor -- Adapted Material.
607
+ Every recipient of Adapted Material from You
608
+ automatically receives an offer from the Licensor to
609
+ exercise the Licensed Rights in the Adapted Material
610
+ under the conditions of the Adapter's License You apply.
611
+
612
+ c. No downstream restrictions. You may not offer or impose
613
+ any additional or different terms or conditions on, or
614
+ apply any Effective Technological Measures to, the
615
+ Licensed Material if doing so restricts exercise of the
616
+ Licensed Rights by any recipient of the Licensed
617
+ Material.
618
+
619
+ 6. No endorsement. Nothing in this Public License constitutes or
620
+ may be construed as permission to assert or imply that You
621
+ are, or that Your use of the Licensed Material is, connected
622
+ with, or sponsored, endorsed, or granted official status by,
623
+ the Licensor or others designated to receive attribution as
624
+ provided in Section 3(a)(1)(A)(i).
625
+
626
+ b. Other rights.
627
+
628
+ 1. Moral rights, such as the right of integrity, are not
629
+ licensed under this Public License, nor are publicity,
630
+ privacy, and/or other similar personality rights; however, to
631
+ the extent possible, the Licensor waives and/or agrees not to
632
+ assert any such rights held by the Licensor to the limited
633
+ extent necessary to allow You to exercise the Licensed
634
+ Rights, but not otherwise.
635
+
636
+ 2. Patent and trademark rights are not licensed under this
637
+ Public License.
638
+
639
+ 3. To the extent possible, the Licensor waives any right to
640
+ collect royalties from You for the exercise of the Licensed
641
+ Rights, whether directly or through a collecting society
642
+ under any voluntary or waivable statutory or compulsory
643
+ licensing scheme. In all other cases the Licensor expressly
644
+ reserves any right to collect such royalties, including when
645
+ the Licensed Material is used other than for NonCommercial
646
+ purposes.
647
+
648
+
649
+ Section 3 -- License Conditions.
650
+
651
+ Your exercise of the Licensed Rights is expressly made subject to the
652
+ following conditions.
653
+
654
+ a. Attribution.
655
+
656
+ 1. If You Share the Licensed Material (including in modified
657
+ form), You must:
658
+
659
+ a. retain the following if it is supplied by the Licensor
660
+ with the Licensed Material:
661
+
662
+ i. identification of the creator(s) of the Licensed
663
+ Material and any others designated to receive
664
+ attribution, in any reasonable manner requested by
665
+ the Licensor (including by pseudonym if
666
+ designated);
667
+
668
+ ii. a copyright notice;
669
+
670
+ iii. a notice that refers to this Public License;
671
+
672
+ iv. a notice that refers to the disclaimer of
673
+ warranties;
674
+
675
+ v. a URI or hyperlink to the Licensed Material to the
676
+ extent reasonably practicable;
677
+
678
+ b. indicate if You modified the Licensed Material and
679
+ retain an indication of any previous modifications; and
680
+
681
+ c. indicate the Licensed Material is licensed under this
682
+ Public License, and include the text of, or the URI or
683
+ hyperlink to, this Public License.
684
+
685
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
686
+ reasonable manner based on the medium, means, and context in
687
+ which You Share the Licensed Material. For example, it may be
688
+ reasonable to satisfy the conditions by providing a URI or
689
+ hyperlink to a resource that includes the required
690
+ information.
691
+ 3. If requested by the Licensor, You must remove any of the
692
+ information required by Section 3(a)(1)(A) to the extent
693
+ reasonably practicable.
694
+
695
+ b. ShareAlike.
696
+
697
+ In addition to the conditions in Section 3(a), if You Share
698
+ Adapted Material You produce, the following conditions also apply.
699
+
700
+ 1. The Adapter's License You apply must be a Creative Commons
701
+ license with the same License Elements, this version or
702
+ later, or a BY-NC-SA Compatible License.
703
+
704
+ 2. You must include the text of, or the URI or hyperlink to, the
705
+ Adapter's License You apply. You may satisfy this condition
706
+ in any reasonable manner based on the medium, means, and
707
+ context in which You Share Adapted Material.
708
+
709
+ 3. You may not offer or impose any additional or different terms
710
+ or conditions on, or apply any Effective Technological
711
+ Measures to, Adapted Material that restrict exercise of the
712
+ rights granted under the Adapter's License You apply.
713
+
714
+
715
+ Section 4 -- Sui Generis Database Rights.
716
+
717
+ Where the Licensed Rights include Sui Generis Database Rights that
718
+ apply to Your use of the Licensed Material:
719
+
720
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
721
+ to extract, reuse, reproduce, and Share all or a substantial
722
+ portion of the contents of the database for NonCommercial purposes
723
+ only;
724
+
725
+ b. if You include all or a substantial portion of the database
726
+ contents in a database in which You have Sui Generis Database
727
+ Rights, then the database in which You have Sui Generis Database
728
+ Rights (but not its individual contents) is Adapted Material,
729
+ including for purposes of Section 3(b); and
730
+
731
+ c. You must comply with the conditions in Section 3(a) if You Share
732
+ all or a substantial portion of the contents of the database.
733
+
734
+ For the avoidance of doubt, this Section 4 supplements and does not
735
+ replace Your obligations under this Public License where the Licensed
736
+ Rights include other Copyright and Similar Rights.
737
+
738
+
739
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
740
+
741
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
742
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
743
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
744
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
745
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
746
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
747
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
748
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
749
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
750
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
751
+
752
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
753
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
754
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
755
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
756
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
757
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
758
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
759
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
760
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
761
+
762
+ c. The disclaimer of warranties and limitation of liability provided
763
+ above shall be interpreted in a manner that, to the extent
764
+ possible, most closely approximates an absolute disclaimer and
765
+ waiver of all liability.
766
+
767
+
768
+ Section 6 -- Term and Termination.
769
+
770
+ a. This Public License applies for the term of the Copyright and
771
+ Similar Rights licensed here. However, if You fail to comply with
772
+ this Public License, then Your rights under this Public License
773
+ terminate automatically.
774
+
775
+ b. Where Your right to use the Licensed Material has terminated under
776
+ Section 6(a), it reinstates:
777
+
778
+ 1. automatically as of the date the violation is cured, provided
779
+ it is cured within 30 days of Your discovery of the
780
+ violation; or
781
+
782
+ 2. upon express reinstatement by the Licensor.
783
+
784
+ For the avoidance of doubt, this Section 6(b) does not affect any
785
+ right the Licensor may have to seek remedies for Your violations
786
+ of this Public License.
787
+
788
+ c. For the avoidance of doubt, the Licensor may also offer the
789
+ Licensed Material under separate terms or conditions or stop
790
+ distributing the Licensed Material at any time; however, doing so
791
+ will not terminate this Public License.
792
+
793
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
794
+ License.
795
+
796
+
797
+ Section 7 -- Other Terms and Conditions.
798
+
799
+ a. The Licensor shall not be bound by any additional or different
800
+ terms or conditions communicated by You unless expressly agreed.
801
+
802
+ b. Any arrangements, understandings, or agreements regarding the
803
+ Licensed Material not stated herein are separate from and
804
+ independent of the terms and conditions of this Public License.
805
+
806
+
807
+ Section 8 -- Interpretation.
808
+
809
+ a. For the avoidance of doubt, this Public License does not, and
810
+ shall not be interpreted to, reduce, limit, restrict, or impose
811
+ conditions on any use of the Licensed Material that could lawfully
812
+ be made without permission under this Public License.
813
+
814
+ b. To the extent possible, if any provision of this Public License is
815
+ deemed unenforceable, it shall be automatically reformed to the
816
+ minimum extent necessary to make it enforceable. If the provision
817
+ cannot be reformed, it shall be severed from this Public License
818
+ without affecting the enforceability of the remaining terms and
819
+ conditions.
820
+
821
+ c. No term or condition of this Public License will be waived and no
822
+ failure to comply consented to unless expressly agreed to by the
823
+ Licensor.
824
+
825
+ d. Nothing in this Public License constitutes or may be interpreted
826
+ as a limitation upon, or waiver of, any privileges and immunities
827
+ that apply to the Licensor or You, including from the legal
828
+ processes of any jurisdiction or authority.
829
+
830
+ =======================================================================
831
+
832
+ Creative Commons is not a party to its public
833
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
834
+ its public licenses to material it publishes and in those instances
835
+ will be considered the “Licensor.” The text of the Creative Commons
836
+ public licenses is dedicated to the public domain under the CC0 Public
837
+ Domain Dedication. Except for the limited purpose of indicating that
838
+ material is shared under a Creative Commons public license or as
839
+ otherwise permitted by the Creative Commons policies published at
840
+ creativecommons.org/policies, Creative Commons does not authorize the
841
+ use of the trademark "Creative Commons" or any other trademark or logo
842
+ of Creative Commons without its prior written consent including,
843
+ without limitation, in connection with any unauthorized modifications
844
+ to any of its public licenses or any other arrangements,
845
+ understandings, or agreements concerning use of licensed material. For
846
+ the avoidance of doubt, this paragraph does not form part of the
847
+ public licenses.
848
+
849
+ Creative Commons may be contacted at creativecommons.org.
850
+
851
+
852
+
853
+ # Objectron License
854
+ https://github.com/google-research-datasets/Objectron
855
+ https://github.com/google-research-datasets/Objectron/blob/main/LICENSE
856
+
857
+
858
+ # Computational Use of Data Agreement v1.0
859
+
860
+ This is the Computational Use of Data Agreement, Version 1.0 (the “C-UDA”). Capitalized terms are defined in Section 5. Data Provider and you agree as follows:
861
+
862
+ 1. **Provision of the Data**
863
+
864
+ 1.1. You may use, modify, and distribute the Data made available to you by the Data Provider under this C-UDA for Computational Use if you follow the C-UDA's terms.
865
+
866
+ 1.2. Data Provider will not sue you or any Downstream Recipient for any claim arising out of the use, modification, or distribution of the Data provided you meet the terms of the C-UDA.
867
+
868
+ 1.3 This C-UDA does not restrict your use, modification, or distribution of any portions of the Data that are in the public domain or that may be used, modified, or distributed under any other legal exception or limitation.
869
+
870
+ 2. **Restrictions**
871
+
872
+ 2.1 You agree that you will use the Data solely for Computational Use.
873
+
874
+ 2.2 The C-UDA does not impose any restriction with respect to the use, modification, or distribution of Results.
875
+
876
+ 3. **Redistribution of Data**
877
+
878
+ 3.1. You may redistribute the Data, so long as:
879
+
880
+ 3.1.1. You include with any Data you redistribute all credit or attribution information that you received with the Data, and your terms require any Downstream Recipient to do the same; and
881
+
882
+ 3.1.2. You bind each recipient to whom you redistribute the Data to the terms of the C-UDA.
883
+
884
+ 4. **No Warranty, Limitation of Liability**
885
+
886
+ 4.1. Data Provider does not represent or warrant that it has any rights whatsoever in the Data.
887
+
888
+ 4.2. THE DATA IS PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
889
+
890
+ 4.3. NEITHER DATA PROVIDER NOR ANY UPSTREAM DATA PROVIDER SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE DATA OR RESULTS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
891
+
892
+ 5. **Definitions**
893
+
894
+ 5.1. “Computational Use” means activities necessary to enable the use of Data (alone or along with other material) for analysis by a computer.
895
+
896
+ 5.2. “Data” means the material you receive under the C-UDA in modified or unmodified form, but not including Results.
897
+
898
+ 5.3. “Data Provider” means the source from which you receive the Data and with whom you enter into the C-UDA.
899
+
900
+ 5.4. “Downstream Recipient” means any person or persons who receives the Data directly or indirectly from you in accordance with the C-UDA.
901
+
902
+ 5.5. “Result” means anything that you develop or improve from your use of Data that does not include more than a de minimis portion of the Data on which the use is based. Results may include de minimis portions of the Data necessary to report on or explain use that has been conducted with the Data, such as figures in scientific papers, but do not include more. Artificial intelligence models trained on Data (and which do not include more than a de minimis portion of Data) are Results.
903
+
904
+ 5.6. “Upstream Data Providers” means the source or sources from which the Data Provider directly or indirectly received, under the terms of the C-UDA, material that is included in the Data.
905
+
906
+
MODEL_ZOO.md ADDED
@@ -0,0 +1,17 @@
1
+ # Cube R-CNN Model Zoo on Omni3D
2
+
3
+ ## Models
4
+
5
+ We provide a model zoo for models trained on Omni3D data splits (see paper for more details).
6
+
7
+ | | Omni3D | Omni3D (Indoor only) | Omni3D (Outdoor only) |
8
+ |---------|:-------------------------:|:----------------------------:|:----------------------------:|
9
+ | `res34` | [omni3d/cubercnn_Res34_FPN.pth][res34_omni] | [indoor/cubercnn_Res34_FPN.pth][res34_in] | [outdoor/cubercnn_Res34_FPN.pth][res34_out] |
10
+ | `dla34` | [omni3d/cubercnn_DLA34_FPN.pth][dla34_omni] | [indoor/cubercnn_DLA34_FPN.pth][dla34_in] | [outdoor/cubercnn_DLA34_FPN.pth][dla34_out] |
11
+
12
+ [dla34_omni]: https://dl.fbaipublicfiles.com/cubercnn/omni3d/cubercnn_DLA34_FPN.pth
13
+ [dla34_in]: https://dl.fbaipublicfiles.com/cubercnn/indoor/cubercnn_DLA34_FPN.pth
14
+ [dla34_out]: https://dl.fbaipublicfiles.com/cubercnn/outdoor/cubercnn_DLA34_FPN.pth
15
+ [res34_omni]: https://dl.fbaipublicfiles.com/cubercnn/omni3d/cubercnn_Res34_FPN.pth
16
+ [res34_in]: https://dl.fbaipublicfiles.com/cubercnn/indoor/cubercnn_Res34_FPN.pth
17
+ [res34_out]: https://dl.fbaipublicfiles.com/cubercnn/outdoor/cubercnn_Res34_FPN.pth
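+ 
+ ## Sanity-checking a checkpoint (sketch)
+ 
+ The snippet below is a minimal, non-authoritative sketch for inspecting a downloaded checkpoint before wiring it into a config; it assumes the files above are standard PyTorch checkpoints and uses a hypothetical local path.
+ 
+ ```python
+ import torch
+ 
+ # Hypothetical path to a checkpoint downloaded from the table above.
+ ckpt = torch.load("checkpoints/cubercnn_DLA34_FPN.pth", map_location="cpu")
+ if isinstance(ckpt, dict):
+     print(list(ckpt.keys())[:5])  # peek at the top-level keys before loading into a model
+ ```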
ProposalNetwork/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .spaces import *
2
+ from .conversions import *
3
+ from .utils import *
ProposalNetwork/utils/conversions.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import numpy as np
3
+ from detectron2.structures import Boxes
4
+
5
+ def cube_to_box(cube,K):
6
+ '''
7
+ Converts a Cube to a Box.
8
+
9
+ Args:
10
+ cube: A Cube.
11
+ K: The 3x3 camera intrinsic matrix.
12
+
13
+ Returns:
14
+ A Box.
15
+ '''
16
+ bube_corners = cube.get_bube_corners(K)
17
+
18
+ min_x = torch.min(bube_corners[:,0])
19
+ max_x = torch.max(bube_corners[:,0])
20
+ min_y = torch.min(bube_corners[:,1])
21
+ max_y = torch.max(bube_corners[:,1])
22
+
23
+ return Boxes(torch.tensor([[min_x, min_y, max_x, max_y]], device=cube.tensor.device))
24
+
25
+ def cubes_to_box(cubes, K, im_shape):
26
+ '''
27
+ Converts Cubes to a list of per-instance Boxes.
28
+
29
+ Args:
30
+ cubes: A Cubes.
31
+ K: The 3x3 camera intrinsic matrix, shared by all cubes.
32
+ im_shape: The shape of the image (width, height).
33
+
34
+ Returns:
35
+ A list of Boxes, one per instance.
36
+ '''
37
+ bube_corners = cubes.get_bube_corners(K, im_shape)
38
+ min_x, _ = torch.min(bube_corners[:, :, :, 0], 2)
39
+ max_x, _ = torch.max(bube_corners[:, :, :, 0], 2)
40
+ min_y, _ = torch.min(bube_corners[:, :, :, 1], 2)
41
+ max_y, _ = torch.max(bube_corners[:, :, :, 1], 2)
42
+
43
+ values = torch.stack((min_x, min_y, max_x, max_y),dim=2)
44
+ box_list = []
45
+ for i in range(cubes.num_instances):
46
+ box_list.append(Boxes(values[i]))
47
+
48
+ return box_list
49
+
50
+
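+ # Usage sketch (hypothetical objects): project 3D cube proposals to enclosing 2D boxes.
+ # Assumes `cubes` is a Cubes instance from ProposalNetwork.utils.spaces and `K` a 3x3
+ # intrinsics tensor:
+ #   boxes_per_instance = cubes_to_box(cubes, K, im_shape=(640, 480))
+ #   # -> list with one detectron2 Boxes per instance, rows are [x1, y1, x2, y2]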
ProposalNetwork/utils/plane.py ADDED
@@ -0,0 +1,209 @@
1
+ import random
2
+ import torch
3
+ import numpy as np
4
+
5
+ class Plane_torch:
6
+ """
7
+ Implementation of planar RANSAC.
8
+
9
+ Class for Plane object, which finds the equation of an infinite plane using the RANSAC algorithm.
10
+
11
+ Call `fit(.)` to randomly take 3 points of pointcloud to verify inliers based on a threshold.
12
+
13
+ ![Plane](https://raw.githubusercontent.com/leomariga/pyRANSAC-3D/master/doc/plano.gif "Plane")
14
+
15
+ ---
16
+ """
17
+
18
+ def __init__(self):
19
+ self.inliers = []
20
+ self.equation = []
21
+
22
+ def fit(self, pts, thresh=0.05, minPoints=100, maxIteration=1000):
23
+ """
24
+ Find the best equation for a plane.
25
+
26
+ :param pts: 3D point cloud as a `torch.Tensor (N,3)`.
27
+ :param thresh: Threshold distance from the plane which is considered inlier.
28
+ :param maxIteration: Number of maximum iteration which RANSAC will loop over.
29
+ :returns:
30
+ - `self.equation`: Parameters of the plane using Ax+By+Cy+D `torch.Tensor(4)`
31
+ - `self.inliers`: points from the dataset considered inliers
32
+
33
+ ---
34
+ """
35
+ n_points = pts.shape[0]
36
+ best_eq = []
37
+ best_inliers = []
38
+
39
+ for it in range(maxIteration):
40
+
41
+ # Samples 3 random points
42
+ id_samples = torch.randperm(n_points)[:3]
43
+ pt_samples = pts[id_samples]
44
+
45
+ # We have to find the plane equation described by those 3 points
46
+ # We find first 2 vectors that are part of this plane
47
+ # A = pt2 - pt1
48
+ # B = pt3 - pt1
49
+
50
+ vecA = pt_samples[1, :] - pt_samples[0, :]
51
+ vecB = pt_samples[2, :] - pt_samples[0, :]
52
+
53
+ # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane
54
+ vecC = torch.cross(vecA, vecB)
55
+
56
+ # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[2]*z = -k
57
+ # We have to use a point to find k
58
+ vecC = vecC / torch.norm(vecC, p=2)
59
+ k = -torch.sum(torch.mul(vecC, pt_samples[1, :]))
60
+ plane_eq = torch.tensor([vecC[0], vecC[1], vecC[2], k])
61
+
62
+ # Distance from a point to a plane
63
+ # https://mathworld.wolfram.com/Point-PlaneDistance.html
64
+ pt_id_inliers = [] # list of inliers ids
65
+ dist_pt = (
66
+ plane_eq[0] * pts[:, 0] + plane_eq[1] * pts[:, 1] + plane_eq[2] * pts[:, 2] + plane_eq[3]
67
+ ) / torch.sqrt(plane_eq[0] ** 2 + plane_eq[1] ** 2 + plane_eq[2] ** 2)
68
+
69
+ # Select indexes where distance is smaller than the threshold
70
+ pt_id_inliers = torch.where(torch.abs(dist_pt) <= thresh)[0]
71
+ if len(pt_id_inliers) > len(best_inliers):
72
+ best_eq = plane_eq
73
+ best_inliers = pt_id_inliers
74
+ self.inliers = best_inliers
75
+ self.equation = best_eq
76
+
77
+ return -self.equation, self.inliers
78
+
79
+ def fit_parallel(self, pts:torch.Tensor, thresh=0.05, minPoints=100, maxIteration=1000):
80
+ """
81
+ Find the best equation for a plane.
82
+
83
+ :param pts: 3D point cloud as a `torch.Tensor (N,3)`.
84
+ :param thresh: Threshold distance from the plane which is considered inlier.
85
+ :param maxIteration: Number of maximum iteration which RANSAC will loop over.
86
+ :returns:
87
+ - `self.equation`: Parameters of the plane using Ax+By+Cy+D `torch.Tensor(4)`
88
+ - `self.inliers`: points from the dataset considered inliers
89
+
90
+ ---
91
+ """
92
+ n_points = pts.shape[0]
93
+
94
+ # Samples shape (maxIteration, 3) random points
95
+ id_samples = torch.tensor([random.sample(range(0, n_points), 3) for _ in range(maxIteration)],device=pts.device)
96
+ pt_samples = pts[id_samples]
97
+
98
+ # We have to find the plane equation described by those 3 points
99
+ # We find first 2 vectors that are part of this plane
100
+ # A = pt2 - pt1
101
+ # B = pt3 - pt1
102
+
103
+ vecA = pt_samples[:, 1, :] - pt_samples[:, 0, :]
104
+ vecB = pt_samples[:, 2, :] - pt_samples[:, 0, :]
105
+
106
+ # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane
107
+ vecC = torch.cross(vecA, vecB, dim=-1)
108
+
109
+ # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[2]*z = -k
110
+ # We have to use a point to find k
111
+ vecC = vecC / torch.norm(vecC, p=2, dim=1, keepdim=True)
112
+ k = -torch.sum(torch.mul(vecC, pt_samples[:, 1, :]), dim=1)
113
+ plane_eqs = torch.column_stack([vecC[:, 0], vecC[:, 1], vecC[:, 2], k])
114
+
115
+ # Distance from a point to a plane
116
+ # https://mathworld.wolfram.com/Point-PlaneDistance.html
117
+ dist_pt = (
118
+ plane_eqs[:,0].unsqueeze(1) * pts[:, 0] + plane_eqs[:,1].unsqueeze(1) * pts[:, 1] + plane_eqs[:,2].unsqueeze(1) * pts[:, 2] + plane_eqs[:,3].unsqueeze(1)
119
+ ) / torch.sqrt(plane_eqs[:,0] ** 2 + plane_eqs[:,1] ** 2 + plane_eqs[:,2] ** 2).unsqueeze(1)
120
+
121
+ # Select indexes where distance is smaller than the threshold
122
+ # maxIteration x n_points
123
+ # row with most inliers
124
+
125
+ pt_id_inliers = torch.abs(dist_pt) <= thresh
126
+ counts = torch.sum(pt_id_inliers, dim=1)
127
+
128
+ best_eq = plane_eqs[torch.argmax(counts)]
129
+ best_inliers_id = pt_id_inliers[torch.argmax(counts)]
130
+ # convert boolean tensor to indices
131
+ best_inliers = torch.where(best_inliers_id)[0]
132
+ self.inliers = best_inliers
133
+ self.equation = best_eq
134
+ return -self.equation, self.inliers
135
+
136
+
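+ # Usage sketch (synthetic points): fit a ground plane to an N x 3 point cloud in one call.
+ #   pts = torch.randn(5000, 3)                               # hypothetical depth-derived points
+ #   eq, inlier_idx = Plane_torch().fit_parallel(pts, thresh=0.05, maxIteration=1000)
+ #   # eq holds [A, B, C, D] of the plane Ax + By + Cz + D = 0; inlier_idx indexes rows of pts.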
137
+ class Plane_np:
138
+ """
139
+ Implementation of planar RANSAC.
140
+
141
+ Class for Plane object, which finds the equation of an infinite plane using the RANSAC algorithm.
142
+
143
+ Call `fit(.)` to randomly take 3 points of pointcloud to verify inliers based on a threshold.
144
+
145
+ ![Plane](https://raw.githubusercontent.com/leomariga/pyRANSAC-3D/master/doc/plano.gif "Plane")
146
+
147
+ ---
148
+ """
149
+
150
+ def __init__(self):
151
+ self.inliers = []
152
+ self.equation = []
153
+
154
+ def fit(self, pts, thresh=0.05, minPoints=100, maxIteration=1000):
155
+ """
156
+ Find the best equation for a plane.
157
+
158
+ :param pts: 3D point cloud as a `np.array (N,3)`.
159
+ :param thresh: Threshold distance from the plane which is considered inlier.
160
+ :param maxIteration: Number of maximum iteration which RANSAC will loop over.
161
+ :returns:
162
+ - `self.equation`: Parameters of the plane using Ax+By+Cy+D `np.array (1, 4)`
163
+ - `self.inliers`: points from the dataset considered inliers
164
+
165
+ ---
166
+ """
167
+ n_points = pts.shape[0]
168
+ best_eq = []
169
+ best_inliers = []
170
+
171
+ for it in range(maxIteration):
172
+
173
+ # Samples 3 random points
174
+ id_samples = random.sample(range(0, n_points), 3)
175
+ pt_samples = pts[id_samples]
176
+
177
+ # We have to find the plane equation described by those 3 points
178
+ # We find first 2 vectors that are part of this plane
179
+ # A = pt2 - pt1
180
+ # B = pt3 - pt1
181
+
182
+ vecA = pt_samples[1, :] - pt_samples[0, :]
183
+ vecB = pt_samples[2, :] - pt_samples[0, :]
184
+
185
+ # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane
186
+ vecC = np.cross(vecA, vecB)
187
+
188
+ # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[2]*z = -k
189
+ # We have to use a point to find k
190
+ vecC = vecC / np.linalg.norm(vecC)
191
+ k = -np.sum(np.multiply(vecC, pt_samples[1, :]))
192
+ plane_eq = [vecC[0], vecC[1], vecC[2], k]
193
+
194
+ # Distance from a point to a plane
195
+ # https://mathworld.wolfram.com/Point-PlaneDistance.html
196
+ pt_id_inliers = [] # list of inliers ids
197
+ dist_pt = (
198
+ plane_eq[0] * pts[:, 0] + plane_eq[1] * pts[:, 1] + plane_eq[2] * pts[:, 2] + plane_eq[3]
199
+ ) / np.sqrt(plane_eq[0] ** 2 + plane_eq[1] ** 2 + plane_eq[2] ** 2)
200
+
201
+ # Select indexes where distance is smaller than the threshold
202
+ pt_id_inliers = np.where(np.abs(dist_pt) <= thresh)[0]
203
+ if len(pt_id_inliers) > len(best_inliers):
204
+ best_eq = plane_eq
205
+ best_inliers = pt_id_inliers
206
+ self.inliers = best_inliers
207
+ self.equation = best_eq
208
+
209
+ return self.equation, self.inliers
ProposalNetwork/utils/spaces.py ADDED
@@ -0,0 +1,328 @@
1
+ import numpy as np
2
+ import torch
3
+ from cubercnn import util
4
+
5
+ '''
6
+ coordinate system is assumed to have origin in the upper left
7
+ (0,0) _________________(N,0)
8
+ |
9
+ |
10
+ |
11
+ |
12
+ |
13
+ (0,M)
14
+ '''
15
+ """
16
+ class Cube:
17
+ '''
18
+ 3D box in the format [c1, c2, c3, w, h, l, R]
19
+
20
+ Args:
21
+ c1: The x coordinate of the center of the box.
22
+ c2: The y coordinate of the center of the box.
23
+ c3: The z coordinate of the center of the box.
24
+ w: The width of the box in meters.
25
+ h: The height of the box in meters.
26
+ l: The length of the box in meters.
27
+ R: The 3D rotation matrix of the box.
28
+ ```
29
+
30
+ _____________________
31
+ /| /|
32
+ / | / |
33
+ / | / |
34
+ /___|_________________/ |
35
+ | | | | h
36
+ | | | |
37
+ | | | |
38
+ | | (c1,c2,c3) | |
39
+ | |_________________|___|
40
+ | / | /
41
+ | / | /
42
+ | / | / l
43
+ |/_____________________|/
44
+ w
45
+ ```
46
+ '''
47
+ def __init__(self,tensor: torch.Tensor, R: torch.Tensor, score=None, label=None) -> None:
48
+ self.tensor = tensor
49
+ self.center = tensor[:3]
50
+ self.dimensions = tensor[3:6]
51
+ self.rotation = R
52
+
53
+ # score and label are meant as auxiliary information
54
+ self.score = score
55
+ self.label = label
56
+
57
+ def get_cube(self):
58
+ color = [c/255.0 for c in util.get_color()]
59
+ return util.mesh_cuboid(torch.cat((self.center,self.dimensions)), self.rotation, color=color)
60
+
61
+ def get_all_corners(self):
62
+ '''wrap ``util.get_cuboid_verts_faces``
63
+
64
+ Returns:
65
+ verts: the 3D vertices of the cuboid in camera space'''
66
+ verts, _ = util.get_cuboid_verts_faces(torch.cat((self.center,self.dimensions)), self.rotation)
67
+ return verts
68
+
69
+ def get_bube_corners(self,K) -> torch.Tensor:
70
+ cube_corners = self.get_all_corners()
71
+ cube_corners = torch.mm(K, cube_corners.t()).t()
72
+ return cube_corners[:,:2]/cube_corners[:,2].unsqueeze(1)
73
+
74
+ def get_volume(self) -> float:
75
+ return self.dimensions.prod().item()
76
+
77
+
78
+ def __repr__(self) -> str:
79
+ return f'Cube({self.center}, {self.dimensions}, {self.rotation})'
80
+
81
+ def to_device(self, device):
82
+ '''
83
+ Move all tensors of the instantiated class to the specified device.
84
+
85
+ Args:
86
+ device: The device to move the tensors to (e.g., 'cuda', 'cpu').
87
+ '''
88
+ self.tensor = self.tensor.to(device)
89
+ self.center = self.center.to(device)
90
+ self.dimensions = self.dimensions.to(device)
91
+ self.rotation = self.rotation.to(device)
92
+ return self
93
+ """
94
+
95
+ class Cubes:
96
+ '''
97
+ 3D boxes in the format [[c1, c2, c3, w, h, l, R1...R9]]
98
+
99
+ inspired by `detectron2.structures.Boxes`
100
+
101
+ Args:
102
+ tensor: torch.tensor(
103
+ c1: The x coordinates of the center of the boxes.
104
+ c2: The y coordinates of the center of the boxes.
105
+ c3: The z coordinates of the center of the boxes.
106
+ w: The width of the boxes in meters.
107
+ h: The height of the boxes in meters.
108
+ l: The length of the boxes in meters.
109
+ R: The flattened 3D rotation matrix of the boxes (i.e. the rows are next to each other).
110
+ )
111
+ of shape (N, 15).
112
+ ```
113
+ _____________________
114
+ /| /|
115
+ / | / |
116
+ / | / |
117
+ /___|_________________/ |
118
+ | | | | h
119
+ | | | |
120
+ | | | |
121
+ | | (c1,c2,c3) | |
122
+ | |_________________|___|
123
+ | / | /
124
+ | / | /
125
+ | / | / l
126
+ |/_____________________|/
127
+ w
128
+ ```
129
+ '''
130
+ def __init__(self,tensor: torch.Tensor, scores=None, labels=None) -> None:
131
+
132
+ # score and label are meant as auxiliary information
133
+ if scores is not None:
134
+ assert scores.ndim == 2, f"scores.shape must be (n_instances, n_proposals), but was {scores.shape}"
135
+ self.scores = scores
136
+ self.labels = labels
137
+
138
+ if not isinstance(tensor, torch.Tensor):
139
+ if not isinstance(tensor, np.ndarray):
140
+ tensor = np.asarray(tensor)
141
+ tensor = torch.as_tensor(tensor, dtype=torch.float32, device=torch.device("cpu"))
142
+ else:
143
+ tensor = tensor.to(torch.float32)
144
+ if tensor.numel() == 0:
145
+ tensor = tensor.reshape((-1, 15)).to(dtype=torch.float32)
146
+ self.tensor = tensor
147
+ if self.tensor.dim() == 1:
148
+ self.tensor = self.tensor.unsqueeze(0)
149
+ if self.tensor.dim() == 2:
150
+ self.tensor = self.tensor.unsqueeze(0)
151
+
152
+ @property
153
+ def centers(self):
154
+ return self.tensor[:, :, :3]
155
+
156
+ @property
157
+ def dimensions(self):
158
+ return self.tensor[:, :, 3:6]
159
+
160
+ @property
161
+ def rotations(self):
162
+ shape = self.tensor.shape
163
+ return self.tensor[:, :, 6:].reshape(shape[0],shape[1], 3, 3)
164
+
165
+ @property
166
+ def device(self):
167
+ return self.tensor.device
168
+
169
+ @property
170
+ def num_instances(self):
171
+ return self.tensor.shape[0]
172
+
173
+ @property
174
+ def shape(self):
175
+ return self.tensor.shape
176
+
177
+ def clone(self) -> "Cubes":
178
+ """
179
+ Clone the Cubes.
180
+
181
+ Returns:
182
+ Cubes
183
+ """
184
+ return Cubes(self.tensor.clone())
185
+
186
+
187
+ def get_cubes(self):
188
+ color = [c/255.0 for c in util.get_color()]
189
+ return util.mesh_cuboid(torch.cat((self.centers.squeeze(0),self.dimensions.squeeze(0)),dim=1), self.rotations.squeeze(0), color=color)
190
+
191
+
192
+ def get_all_corners(self):
193
+ '''wrap ``util.get_cuboid_verts_faces``
194
+
195
+ Returns:
196
+ verts: the 3D vertices of the cuboid in camera space'''
197
+
198
+ verts_list = []
199
+ for i in range(self.num_instances):
200
+ verts_next_instance, _ = util.get_cuboid_verts_faces(self.tensor[i, :, :6], self.rotations[i])
201
+ verts_list.append(verts_next_instance)
202
+ verts = torch.stack(verts_list, dim=0)
203
+
204
+ return verts
205
+
206
+ def get_cuboids_verts_faces(self):
207
+ '''wrap ``util.get_cuboid_verts_faces``
208
+
209
+ Returns:
210
+ verts: the 3D vertices of the cuboid in camera space
211
+ faces: the faces of the cuboid in camera space'''
212
+
213
+ verts_list = []
214
+ faces_list = []
215
+ for i in range(self.num_instances):
216
+ verts_next_instance, faces = util.get_cuboid_verts_faces(self.tensor[i, :, :6], self.rotations[i])
217
+ verts_list.append(verts_next_instance)
218
+ faces_list.append(faces)
219
+ verts = torch.stack(verts_list, dim=0)
220
+ faces = torch.stack(faces_list, dim=0)
221
+
222
+ return verts, faces
223
+
224
+ def get_bube_corners(self, K, clamp:tuple=None) -> torch.Tensor:
225
+ '''This assumes that all the cubes have the same camera intrinsic matrix K
226
+
227
+ clamp is a typically the image shape (width, height) to truncate the boxes to image frame, this avoids huge projected boxes
228
+ Returns:
229
+ num_instances x N x 8 x 2'''
230
+ cube_corners = self.get_all_corners() # num_instances x N x 8 x 3
231
+ num_prop = cube_corners.shape[1]
232
+ cube_corners = cube_corners.reshape(self.num_instances * num_prop, 8, 3)
233
+ K_repeated = K.repeat(self.num_instances * num_prop,1,1)
234
+ cube_corners = torch.matmul(K_repeated, cube_corners.transpose(2,1))
235
+ cube_corners = cube_corners[:, :2, :]/cube_corners[:, 2, :].unsqueeze(-2)
236
+ cube_corners = cube_corners.transpose(2,1)
237
+ cube_corners = cube_corners.reshape(self.num_instances, num_prop, 8, 2)
238
+
239
+ # we must clamp and then stack, otherwise the gradient is broken
240
+ if clamp is not None:
241
+ x = torch.clamp(cube_corners[..., 0], int(-clamp[0]/2+1), int(clamp[0]-1+clamp[0]))
242
+ y = torch.clamp(cube_corners[..., 1], int(-clamp[1]/2+1), int(clamp[1]-1+clamp[1]))
243
+ cube_corners = torch.stack((x, y), dim=-1)
244
+
245
+ return cube_corners # num_instances x num_proposals x 8 x 2
246
+
247
+ def get_volumes(self) -> float:
248
+ return self.get_dimensions().prod(1).item()
249
+
250
+ def __len__(self) -> int:
251
+ return self.tensor.shape[0]
252
+
253
+ def __repr__(self) -> str:
254
+ return f'Cubes({self.tensor})'
255
+
256
+ def to(self, device: torch.device):
257
+ # Cubes is assumed float32 and does not support to(dtype)
258
+ if isinstance(self.scores, torch.Tensor):
259
+ self.scores = self.scores.to(device=device)
260
+ if isinstance(self.labels, torch.Tensor):
261
+ self.labels = self.labels.to(device=device)
262
+ return Cubes(self.tensor.to(device=device), self.scores, self.labels)
263
+
264
+ def __getitem__(self, item) -> "Cubes":
265
+ """
266
+ Args:
267
+ item: int, slice, or a BoolTensor
268
+
269
+ Returns:
270
+ Cubes: Create a new :class:`Cubes` by indexing.
271
+
272
+ The following usage are allowed:
273
+
274
+ 1. `new_cubes = cubes[3]`: return a `Cubes` which contains only one box.
275
+ 2. `new_cubes = cubes[2:10]`: return a slice of cubes.
276
+ 3. `new_cubes = cubes[vector]`, where vector is a torch.BoolTensor
277
+ with `length = len(cubes)`. Nonzero elements in the vector will be selected.
278
+
279
+ Note that the returned Cubes might share storage with this Cubes,
280
+ subject to Pytorch's indexing semantics.
281
+ """
282
+ if isinstance(item, int):
283
+ prev_n_prop = self.tensor.shape[1]
284
+ return Cubes(self.tensor[item].view(1, prev_n_prop, -1))
285
+ elif isinstance(item, tuple):
286
+ return Cubes(self.tensor[item[0],item[1]].view(1, 1, -1))
287
+ b = self.tensor[item]
288
+ assert b.dim() == 2, "Indexing on Cubes with {} failed to return a matrix!".format(item)
289
+ return Cubes(b)
290
+
291
+
292
+ @classmethod
293
+ def cat(cls, cubes_list: list["Cubes"]) -> "Cubes":
294
+ """
295
+ Concatenates a list of Cubes into a single Cubes
296
+
297
+ Arguments:
298
+ cubes_list (list[Cubes])
299
+
300
+ Returns:
301
+ Cubes: the concatenated Cubes
302
+ """
303
+ assert isinstance(cubes_list, (list, tuple))
304
+ if len(cubes_list) == 0:
305
+ return cls(torch.empty(0))
306
+ assert all([isinstance(box, Cubes) for box in cubes_list])
307
+
308
+ # use torch.cat (v.s. layers.cat) so the returned cubes never share storage with input
309
+ cat_cubes = cls(torch.cat([b.tensor for b in cubes_list], dim=0))
310
+ return cat_cubes
311
+
312
+ @torch.jit.unused
313
+ def __iter__(self):
314
+ """
315
+ Yield a cube as a Tensor of shape (15,) at a time.
316
+ """
317
+ yield from self.tensor
318
+
319
+ def split(self, split_size: int, dim=1) -> tuple["Cubes"]:
320
+ """same behaviour as torch.split, return a tuple of chunksize Cubes"""
321
+ return tuple(Cubes(x) for x in self.tensor.split(split_size, dim=dim))
322
+
323
+ def reshape(self, *args) -> "Cubes":
324
+ """
325
+ Returns:
326
+ Cubes: reshaped Cubes
327
+ """
328
+ return Cubes(self.tensor.reshape(*args), self.scores, self.labels)
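+ 
+ # Usage sketch (hypothetical values): one instance holding a single axis-aligned proposal.
+ #   R = torch.eye(3).flatten()
+ #   cube = Cubes(torch.cat([torch.tensor([0., 0., 5., 1., 1., 1.]), R]))  # tensor shape (1, 1, 15)
+ #   corners_2d = cube.get_bube_corners(K)  # K: 3x3 intrinsics tensor -> (1, 1, 8, 2) projected corners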
ProposalNetwork/utils/utils.py ADDED
@@ -0,0 +1,564 @@
1
+ import torch
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+
5
+ from detectron2.structures import pairwise_iou
6
+ from pytorch3d.ops import box3d_overlap
7
+
8
+ ##### Proposal
9
+ def normalize_vector(v):
10
+ v_mag = torch.sqrt(v.pow(2).sum())
11
+ v_mag = torch.max(v_mag, torch.tensor([1e-8], device=v.device))
12
+ v_mag = v_mag.view(1,1).expand(1,v.shape[0])
13
+ v = v/v_mag
14
+
15
+ return v[0]
16
+
17
+ def cross_product(u, v):
18
+ i = u[1]*v[2] - u[2]*v[1]
19
+ j = u[2]*v[0] - u[0]*v[2]
20
+ k = u[0]*v[1] - u[1]*v[0]
21
+ out = torch.cat((i.view(1,1), j.view(1,1), k.view(1,1)),1)
22
+
23
+ return out[0]
24
+
25
+ def compute_rotation_matrix_from_ortho6d(poses):
26
+ x_raw = poses[0:3]
27
+ y_raw = poses[3:6]
28
+
29
+ x = normalize_vector(x_raw)
30
+ z = cross_product(x,y_raw)
31
+ z = normalize_vector(z)
32
+ y = cross_product(z,x)
33
+
34
+ x = x.view(-1,3,1)
35
+ y = y.view(-1,3,1)
36
+ z = z.view(-1,3,1)
37
+ matrix = torch.cat((x,y,z), 2)[0]
38
+
39
+ return matrix
40
+
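+ # Usage sketch: map an unconstrained 6D vector (e.g. a network output) to a rotation matrix.
+ #   pose6d = torch.randn(6)                            # hypothetical 6D pose parameterisation
+ #   R = compute_rotation_matrix_from_ortho6d(pose6d)   # 3x3 matrix with orthonormal columns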
41
+ def sample_normal_in_range(means, stds, count, threshold_low=None, threshold_high=None):
42
+ device = means.device
43
+ # Generate samples from a normal distribution
44
+ samples = torch.normal(means.unsqueeze(1).expand(-1,count), stds.unsqueeze(1).expand(-1,count))
45
+
46
+ # Ensure that all samples are greater than threshold_low and less than threshold_high
47
+ if threshold_high is not None and threshold_low is not None:
48
+ tries = 0
49
+ threshold_high = threshold_high.unsqueeze(1).expand_as(samples)
50
+ while torch.any((samples < threshold_low) | (samples > threshold_high)):
51
+ invalid_mask = (samples < threshold_low) | (samples > threshold_high)
52
+ # Replace invalid samples with new samples drawn from the normal distribution; could be done more efficiently by sampling only sum(invalid_mask) new samples, but matching the correct means is difficult then
53
+ samples[invalid_mask] = torch.normal(means.unsqueeze(1).expand(-1,count), stds.unsqueeze(1).expand(-1,count))[invalid_mask]
54
+
55
+ tries += 1
56
+ if tries == 10000:
57
+ break
58
+
59
+ return samples.to(device)
60
+
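+ # Usage sketch: draw 100 truncated-normal samples per object, e.g. depth proposals.
+ #   means, stds = torch.tensor([5.0]), torch.tensor([1.0])          # hypothetical priors
+ #   z = sample_normal_in_range(means, stds, 100, threshold_low=0.5,
+ #                              threshold_high=torch.tensor([20.0]))  # shape (1, 100)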
61
+ def randn_orthobasis_torch(num_samples=1,num_instances=1):
62
+ z = torch.randn(num_instances, num_samples, 3, 3)
63
+ z = z / torch.norm(z, p=2, dim=-1, keepdim=True)
64
+ z[:, :, 0] = torch.cross(z[:, :, 1], z[:, :, 2], dim=-1)
65
+ z[:, :, 0] = z[:, :, 0] / torch.norm(z[:, :, 0], dim=-1, keepdim=True)
66
+ z[:, :, 1] = torch.cross(z[:, :, 2], z[:, :, 0], dim=-1)
67
+ z[:, :, 1] = z[:, :, 1] / torch.norm(z[:, :, 1], dim=-1, keepdim=True)
68
+ return z
69
+
70
+ def randn_orthobasis(num_samples=1):
71
+ z = np.random.randn(num_samples, 3, 3)
72
+ z = z / np.linalg.norm(z, axis=-1, keepdims=True)
73
+ z[:, 0] = np.cross(z[:, 1], z[:, 2], axis=-1)
74
+ z[:, 0] = z[:, 0] / np.linalg.norm(z[:, 0], axis=-1, keepdims=True)
75
+ z[:, 1] = np.cross(z[:, 2], z[:, 0], axis=-1)
76
+ z[:, 1] = z[:, 1] / np.linalg.norm(z[:, 1], axis=-1, keepdims=True)
77
+ return z
78
+
79
+ # ##things for making rotations
80
+ def vec_perp(vec):
81
+ '''generate a vector perpendicular to vec in 3d'''
82
+ # https://math.stackexchange.com/a/2450825
83
+ a, b, c = vec
84
+ if a == 0:
85
+ return np.array([0,c,-b])
86
+ return np.array(normalize_vector(torch.tensor([b,-a,0])))
87
+
88
+ def orthobasis_from_normal(normal, yaw_angle=0):
89
+ '''generate an orthonormal/Rotation matrix basis from a normal vector in 3d
90
+
91
+ returns a 3x3 matrix with the basis vectors as columns, 3rd column is the original normal vector
92
+ '''
93
+ x = rotate_vector(vec_perp(normal), normal, yaw_angle)
94
+ x = x / np.linalg.norm(x, ord=2)
95
+ y = np.cross(normal, x)
96
+ return np.array([x, normal, y]).T # the vectors should be as columns
97
+
98
+ def rotate_vector(v, k, theta):
99
+ '''rotate a vector v around an axis k by an angle theta
100
+ it is assumed that k is a unit vector (p2 norm = 1)'''
101
+ # https://medium.com/@sim30217/rodrigues-rotation-formula-47489db49050
102
+ cos_theta = np.cos(theta)
103
+ sin_theta = np.sin(theta)
104
+
105
+ term1 = v * cos_theta
106
+ term2 = np.cross(k, v) * sin_theta
107
+ term3 = k * np.dot(k, v) * (1 - cos_theta)
108
+
109
+ return term1 + term2 + term3
110
+
111
+ def vec_perp_t(vec):
112
+ '''generate a vector perpendicular to vec in 3d'''
113
+ # https://math.stackexchange.com/a/2450825
114
+ a, b, c = vec
115
+ if a == 0:
116
+ return torch.tensor([0,c,-b], device=vec.device)
117
+ return normalize_vector(torch.tensor([b,-a,0], device=vec.device))
118
+
119
+ def orthobasis_from_normal_t(normal:torch.Tensor, yaw_angles:torch.Tensor=0):
120
+ '''generate an orthonormal/Rotation matrix basis from a normal vector in 3d
121
+
122
+ normal is assumed to be normalised
123
+
124
+ returns a (no. of yaw_angles)x3x3 matrix with the basis vectors as columns, 3rd column is the original normal vector
125
+ '''
126
+ n = len(yaw_angles)
127
+ x = rotate_vector_t(vec_perp_t(normal), normal, yaw_angles)
128
+ # x = x / torch.norm(x, p=2)
129
+ y = torch.cross(normal.view(-1,1), x)
130
+ # y = y / torch.norm(y, p=2, dim=1)
131
+ return torch.cat([x.t(), normal.unsqueeze(0).repeat(n, 1), y.t()],dim=1).reshape(n,3,3).transpose(2,1) # the vectors should be as columns
132
+
133
+ def rotate_vector_t(v, k, theta):
134
+ '''rotate a vector v around an axis k by an angle theta
135
+ it is assumed that k is a unit vector (p2 norm = 1)'''
136
+ # https://medium.com/@sim30217/rodrigues-rotation-formula-47489db49050
137
+ cos_theta = torch.cos(theta)
138
+ sin_theta = torch.sin(theta)
139
+ v2 = v.view(-1,1)
140
+
141
+ term1 = v2 * cos_theta
142
+ term2 = torch.cross(k, v).view(-1, 1) * sin_theta
143
+ term3 = (k * (k @ v)).view(-1, 1) * (1 - cos_theta)
144
+
145
+ return (term1 + term2 + term3)
146
+
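+ # Usage sketch: build candidate rotations consistent with a given (unit) ground-plane normal.
+ #   normal = torch.tensor([0., 1., 0.])
+ #   yaws = torch.linspace(0, 2 * torch.pi, 8)
+ #   Rs = orthobasis_from_normal_t(normal, yaws)   # (8, 3, 3) rotation matrices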
147
+ # ########### End rotations
148
+ def gt_in_norm_range(range,gt):
149
+ tmp = gt-range[0]
150
+ res = tmp / abs(range[1] - range[0])
151
+
152
+ return res
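+ # NOTE: the branches below are unreachable dead code because of the early return above.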
153
+
154
+ if range[0] > 0: # both positive
155
+ tmp = gt-range[0]
156
+ res = tmp / abs(range[1] - range[0])
157
+ elif range[1] > 0: # lower negative upper positive
158
+ if gt > 0:
159
+ tmp = gt-range[0]
160
+ else:
161
+ tmp = range[1]-gt
162
+ res = tmp / abs(range[1] - range[0])
163
+ else: # both negative
164
+ tmp = range[1]-gt
165
+ res = tmp / abs(range[1] - range[0])
166
+
167
+ return res
168
+
169
+ def vectorized_linspace(start_tensor, end_tensor, number_of_steps):
170
+ # Calculate spacing
171
+ spacing = (end_tensor - start_tensor) / (number_of_steps - 1)
172
+ # Create linear spaces with arange
173
+ linear_spaces = torch.arange(start=0, end=number_of_steps, dtype=start_tensor.dtype, device=start_tensor.device)
174
+ linear_spaces = linear_spaces.repeat(start_tensor.size(0),1)
175
+ linear_spaces = linear_spaces * spacing[:,None] + start_tensor[:,None]
176
+ return linear_spaces
177
+
178
+
179
+
180
+ ##### Scoring
181
+ def iou_2d(gt_box, proposal_boxes):
182
+ '''
183
+ gt_box: Boxes
184
+ proposal_box: Boxes
185
+ '''
186
+ IoU = pairwise_iou(gt_box,proposal_boxes).flatten()
187
+ return IoU
188
+
189
+ def iou_3d(gt_cube, proposal_cubes):
190
+ """
191
+ Compute the Intersection over Union (IoU) of two 3D cubes.
192
+
193
+ Parameters:
194
+ - gt_cube: GT Cube.
195
+ - proposal_cubes: Proposal Cubes.
196
+
197
+ Returns:
198
+ - iou: Intersection over Union (IoU) value.
199
+ """
200
+ gt_corners = gt_cube.get_all_corners()[0]
201
+ proposal_corners = proposal_cubes.get_all_corners()[0]
202
+ vol, iou = box3d_overlap(gt_corners,proposal_corners)
203
+ iou = iou[0]
204
+
205
+ return iou
206
+
207
+ def custom_mapping(x,beta=1.7):
208
+ '''
209
+ maps the input curve to be S shaped instead of linear
210
+
211
+ Args:
212
+ x: list of floats between and including 0 and 1
213
+ beta: number > 1, higher beta is more aggressive
214
+
215
+ '''
216
+ mapped_list = []
217
+ for i in range(len(x)):
218
+ if x[i] <= 0:
219
+ mapped_list.append(0.0)
220
+ else:
221
+ mapped_list.append((1 / (1 + (x[i] / (1 - x[i])) ** (-beta))))
222
+
223
+ return mapped_list
224
+
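+ # Usage sketch: squash linear scores in [0, 1] into an S-curve (suppresses low, boosts high).
+ #   custom_mapping([0.1, 0.5, 0.9], beta=1.7)   # -> roughly [0.02, 0.5, 0.98]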
225
+ def mask_iou(segmentation_mask, bube_mask):
226
+ '''
227
+ Area is of segmentation_mask
228
+ '''
229
+ bube_mask = torch.tensor(bube_mask, device=segmentation_mask.device)
230
+ intersection = (segmentation_mask * bube_mask).sum()
231
+ if intersection == 0:
232
+ return torch.tensor(0.0)
233
+ union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
234
+ return intersection / union
235
+
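+ # Usage sketch: score how well a projected-cube mask overlaps a segmentation mask.
+ #   seg = torch.zeros(480, 640, dtype=torch.bool); seg[100:200, 100:200] = True
+ #   proj = np.zeros((480, 640), dtype=bool); proj[150:250, 150:250] = True
+ #   mask_iou(seg, proj)   # scalar tensor IoU of the two masks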
236
+ def mod_mask_iou(segmentation_mask, bube_mask):
237
+ '''
238
+ Area is of segmentation_mask
239
+ '''
240
+ bube_mask = torch.tensor(bube_mask, device=segmentation_mask.device)
241
+ intersection = (segmentation_mask * bube_mask).sum()
242
+ if intersection == 0:
243
+ return torch.tensor(0.0)
244
+ union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
245
+ return intersection**5 / union # NOTE not standard IoU
246
+
247
+ def mask_iou_loss(segmentation_mask, bube_mask):
248
+ '''
249
+ Area is of segmentation_mask
250
+ '''
251
+ intersection = (segmentation_mask * bube_mask).sum()
252
+ if intersection == 0:
253
+ return torch.tensor(0.0)
254
+ union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
255
+ return intersection / union
256
+
257
+ def is_gt_included(gt_cube,x_range,y_range,z_range, w_prior, h_prior, l_prior):
258
+ # Define how far away dimensions need to be to be counted as unachievable
259
+ stds_away = 1.5
260
+ # Center
261
+ because_of = []
262
+ if not (x_range[0] < gt_cube.center[0] < x_range[1]):
263
+ if (gt_cube.center[0] < x_range[0]):
264
+ val = abs(x_range[0] - gt_cube.center[0])
265
+ else:
266
+ val = abs(gt_cube.center[0] - x_range[1])
267
+ because_of.append(f'x by {val:.1f}')
268
+ if not (y_range[0] < gt_cube.center[1] < y_range[1]):
269
+ if (gt_cube.center[1] < y_range[0]):
270
+ val = abs(y_range[0] - gt_cube.center[1])
271
+ else:
272
+ val = abs(gt_cube.center[1] - y_range[1])
273
+ because_of.append(f'y by {val:.1f}')
274
+ # Depth
275
+ if not (z_range[0] < gt_cube.center[2] < z_range[1]):
276
+ if (gt_cube.center[2] < z_range[0]):
277
+ val = abs(z_range[0] - gt_cube.center[2])
278
+ else:
279
+ val = abs(gt_cube.center[2] - z_range[1])
280
+ because_of.append(f'z by {val:.1f}')
281
+ # Dimensions
282
+ if (gt_cube.dimensions[0] < w_prior[0]-stds_away*w_prior[1]):
283
+ because_of.append('w-')
284
+ if (gt_cube.dimensions[0] > w_prior[0]+stds_away*w_prior[1]):
285
+ because_of.append('w+')
286
+ if (gt_cube.dimensions[1] < h_prior[0]-stds_away*h_prior[1]):
287
+ because_of.append('h-')
288
+ if (gt_cube.dimensions[1] > h_prior[0]+stds_away*h_prior[1]):
289
+ because_of.append('h+')
290
+ if (gt_cube.dimensions[2] < l_prior[0]-stds_away*l_prior[1]):
291
+ because_of.append('l-')
292
+ if (gt_cube.dimensions[2] > l_prior[0]+stds_away*l_prior[1]):
293
+ because_of.append('l+')
294
+ if because_of == []:
295
+ return True
296
+ else:
297
+ print('GT cannot be found due to',because_of)
298
+ return False
299
+
300
+ # rotation nothing yet
301
+
302
+ def euler_to_unit_vector(eulers):
303
+ """
304
+ Convert Euler angles to a unit vector.
305
+ """
306
+ yaw, pitch, roll = eulers
307
+
308
+ # Calculate the components of the unit vector
309
+ x = np.cos(yaw) * np.cos(pitch)
310
+ y = np.sin(yaw) * np.cos(pitch)
311
+ z = np.sin(pitch)
312
+
313
+ # Normalize the vector
314
+ length = np.sqrt(x**2 + y**2 + z**2)
315
+ unit_vector = np.array([x, y, z]) / length
316
+
317
+ return unit_vector
318
+
319
+
320
+ # helper functions for plotting segmentation masks
321
+ def show_mask(mask, ax, random_color=False):
322
+ if random_color:
323
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
324
+ else:
325
+ color = np.array([30/255, 144/255, 255/255, 0.6])
326
+ h, w = mask.shape[-2:]
327
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
328
+ ax.imshow(mask_image)
329
+
330
+ def show_mask2(masks:np.array, im:np.array, random_color=False):
331
+ """
332
+ Display the masks on top of the image.
333
+
334
+ Args:
335
+ masks (np.array): Array of binary masks, each of shape (h, w).
336
+ im (np.array): Image with shape (h, w, 3).
337
+ random_color (bool, optional): Whether to use random colors for the masks. Defaults to False.
338
+
339
+ Returns:
340
+ np.array: Image with masks displayed on top.
341
+ """
342
+ im_expanded = np.concatenate((im, np.ones((im.shape[0],im.shape[1],1))*255), axis=-1)/255
343
+
344
+ mask_image = np.zeros((im.shape[0],im.shape[1],4))
345
+ for i, mask in enumerate(masks):
346
+ if isinstance(random_color, list):
347
+ color = random_color[i]
348
+ else:
349
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
350
+ h, w = mask.shape[-2:]
351
+ mask_sub = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
352
+ mask_image = mask_image + mask_sub
353
+ mask_binary = (mask_image > 0).astype(bool)
354
+ im_out = im_expanded * ~mask_binary + (0.5* mask_image + 0.5 * (im_expanded * mask_binary))
355
+ im_out = im_out.clip(0,1)
356
+ return im_out
357
+
358
+ def show_points(coords, labels, ax, marker_size=375):
359
+ pos_points = coords[labels==1]
360
+ neg_points = coords[labels==0]
361
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
362
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
363
+
364
+ def show_box(box, ax):
365
+ x0, y0 = box[0], box[1]
366
+ w, h = box[2] - box[0], box[3] - box[1]
367
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
368
+
369
+
370
+
371
+
372
+
373
+
374
+ # Convex Hull
375
+ import torch
376
+
377
+ def direction(p1, p2, p3):
378
+ return (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0])
379
+
380
+ def distance_sq(p1, p2):
381
+ return (p2[0] - p1[0])**2 + (p2[1] - p1[1])**2
382
+
383
+ def findDuplicates(arr):
384
+ Len = len(arr)
385
+ ifPresent = False
386
+ a1 = []
387
+ idx = []
388
+ for i in range(Len - 1):
389
+ for j in range(i + 1, Len):
390
+ # Check whether the element appears again later in the list; if so, record it
391
+ if torch.all(arr[i] == arr[j]):
392
+ # if len(a1) == 0:
393
+ # a1 arr[i]
394
+ # idx.append(i)
395
+ # ifPresent = True
396
+ # else:
397
+ # # if arr[i] in a1:
398
+ # # break
399
+ # # # If element is not present in the ArrayList then add it to ArrayList and make ifPresent true
400
+ # # else:
401
+ a1.append(arr[i])
402
+ idx.append(i)
403
+ ifPresent = True
404
+
405
+ if ifPresent:
406
+ return set(idx) # lazy, inefficient implementation
407
+ else:
408
+ return None
409
+
410
+ def jarvis_march(points):
411
+ '''https://algorithmtutor.com/Computational-Geometry/Convex-Hull-Algorithms-Jarvis-s-March/
412
+ https://algorithmtutor.com/Computational-Geometry/Determining-if-two-consecutive-segments-turn-left-or-right/ '''
413
+ # remove duplicates
414
+ duplicates = findDuplicates(points)
415
+ # this is necessary if there are > 2 duplicates of the same element
416
+ if duplicates is not None:
417
+ plusone = torch.zeros_like(points)
418
+ for i, d in enumerate(duplicates):
419
+ plusone[d] += i + 1
420
+ points = points + plusone
421
+
422
+ # find the lower left point
423
+ min_x = torch.min(points[:, 0])
424
+ candidates = (points[:, 0] == min_x).nonzero(as_tuple=True)[0]
425
+
426
+ # If there are multiple points, choose the one with the highest y value
427
+ if len(candidates) > 1:
428
+ index = candidates[torch.argmax(points[candidates][:, 1])]
429
+ else:
430
+ index = candidates[0]
431
+
432
+ a = points[index]
433
+
434
+ # gift wrapping: repeatedly pick the most counter-clockwise remaining point
435
+ l = index
436
+ result = []
437
+ result.append(a)
438
+
439
+ while (True):
440
+ q = (l + 1) % len(points)
441
+ for i in range(len(points)):
442
+ if i == l:
443
+ continue
444
+ # find the greatest left turn
445
+ # in case of collinearity, consider the farthest point
446
+ d = direction(points[l], points[i], points[q])
447
+ if d > 0 or (d == 0 and distance_sq(points[i], points[l]) > distance_sq(points[q], points[l])):
448
+ q = i
449
+ l = q
450
+ if l == index:
451
+ break
452
+ result.append(points[q])
453
+
454
+ return torch.flip(torch.stack(result), [0,])
455
+
456
+ def fill_polygon(mask, polygon):
457
+ '''
458
+ inspired by https://web.archive.org/web/20120323102807/http://local.wasp.uwa.edu.au/~pbourke/geometry/insidepoly/
459
+ '''
460
+ h, w = mask.shape
461
+ Y, X = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij') # 'ij' gives matrix indexing; 'xy' would be the numpy convention
462
+ grid_coords = torch.stack([X.flatten(), Y.flatten()], dim=1).float().to(mask.device)
463
+
464
+ new_mask = torch.ones(h, w, device=mask.device)
465
+ zeros = torch.zeros(h, w, device=mask.device)
466
+ ones = torch.ones(h, w, device=mask.device)
467
+
468
+ # For some reason it is easier for me to comprehend the algorithm if we iterate counter-clockwise
469
+ for i in range(len(polygon)):
470
+ v1 = polygon[i]
471
+ v2 = polygon[(i + 1) % len(polygon)]
472
+
473
+ # Determine the direction of the edge
474
+ edge_direction = v2 - v1
475
+
476
+ # Given a line segment between P0 (x0,y0) and P1 (x1,y1), another point P (x,y) has the following relationship to the line segment.
477
+ # Compute
478
+ # (y - y0) (x1 - x0) - (x - x0) (y1 - y0)
479
+ # Check if the point is to the left of the edge
480
+ points = (grid_coords[:, 0] - v1[0]) * edge_direction[1] - (grid_coords[:, 1] - v1[1]) * edge_direction[0]
481
+ # we can do the threshold in a clever differentiable way
482
+ # this sets all values to be between 0 and 1
483
+ is_left = torch.min(torch.max(points.view(h, w), zeros), ones)
484
+
485
+ # do the intersection of the 2 masks, this progressily builds op the polygon
486
+ new_mask = new_mask * is_left
487
+
488
+ return new_mask
489
+
490
+ def convex_hull(mask, coords):
491
+ hull = jarvis_march(coords)
492
+ new_mask = fill_polygon(mask, hull)
493
+ return new_mask
494
+
495
+ if __name__ == '__main__':
496
+ import matplotlib.pyplot as plt
497
+ mask = torch.zeros(700, 700, dtype=torch.bool)
498
+ # p = torch.tensor([[5,6],[21.0,7],[21,20],[10,20],[15,20],[5,20],[11,8],[15,15],[17,6],[11,15]])
499
+
500
+ p = torch.tensor([[271.0000, 356.0000],
501
+ [ 25.3744, 356.0000],
502
+ [ 0.0000, 356.0000],
503
+ [ 0.0000, 89.5266],
504
+ [271.0000, 159.3112],
505
+ [ 95.5653, 201.7484],
506
+ [ 0.0000, 0.0000],
507
+ [271.0000, 0.0000]])
508
+
509
+ p2 = torch.tensor([[150.3456, 0.0000],
510
+ [479.0000, 0.0000],
511
+ [ 11.8427, 0.0000],
512
+ [ 0.0000, 0.0000],
513
+ [121.4681, 232.5976],
514
+ [375.6230, 383.9329],
515
+ [ 12.8765, 630.0000],
516
+ [ 0.0000, 344.7250]])
517
+
518
+ p3 = torch.tensor([[290.9577, 171.1176],
519
+ [197.7348, 483.7612],
520
+ [383.0000, 504.0000],
521
+ [383.0000, 27.6211],
522
+ [ 2.2419, 52.6505],
523
+ [ 0.0000, 399.6908],
524
+ [ 0.0000, 504.0000],
525
+ [ 0.0000, 0.0000]])
526
+
527
+ p4 = torch.tensor([[271.0000, 19.5241],
528
+ [271.0000, 356.0000],
529
+ [ 0.0000, 0.0000],
530
+ [271.0000, 0.0000],
531
+ [ 0.0000, 0.0000],
532
+ [163.0264, 77.9408],
533
+ [164.2467, 321.0222],
534
+ [ 0.0000, 356.0000],
535
+ [ 0.0000, 0.0000]])
536
+
537
+ p5 = torch.tensor([[272.0000, 1.0000],
538
+ [ 0.0000, 173.5156],
539
+ [ 74.8860, 141.3913],
540
+ [253.8221, 0.0000],
541
+ [271.0000, 0.0000],
542
+ [271.0000, 356.0000],
543
+ [262.5294, 327.9978],
544
+ [271.0000, 120.8048]])
545
+
546
+ mask5 = convex_hull(mask, p5)
547
+ mask4 = convex_hull(mask, p4)
548
+ mask1 = convex_hull(mask, p)
549
+ mask2 = convex_hull(mask, p2)
550
+ mask3 = convex_hull(mask, p3)
551
+ fig, ax = plt.subplots(1,5, figsize=(20,5))
552
+ ax[0].scatter(p[:,0], p[:,1], c='r')
553
+ ax[1].scatter(p2[:,0], p2[:,1], c='b')
554
+ ax[2].scatter(p3[:,0], p3[:,1], c='g')
555
+ ax[3].scatter(p4[:,0], p4[:,1], c='y')
556
+ ax[4].scatter(p5[:,0], p5[:,1], c='m')
557
+
558
+ ax[0].imshow(mask1)
559
+ ax[1].imshow(mask2)
560
+ ax[2].imshow(mask3)
561
+ ax[3].imshow(mask4)
562
+ ax[4].imshow(mask5)
563
+ plt.show()
564
+ a = 2
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
  title: Weak Cube RCNN
3
- emoji:
4
  colorFrom: indigo
5
- colorTo: purple
6
  sdk: docker
7
  pinned: false
8
- license: cc-by-nc-sa-4.0
9
- short_description: Weak Cube RCNN model
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Weak Cube RCNN
3
+ emoji: 🎲
4
  colorFrom: indigo
5
+ colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
+ license: apache-2.0
 
9
  ---
10
 
11
+ https://github.com/AndreasLH/Weak-Cube-R-CNN
VisualiseGT.py ADDED
@@ -0,0 +1,830 @@
1
+ from pycocotools.coco import COCO
2
+ import os
3
+ import random
4
+ from functools import reduce
5
+ from io import StringIO
6
+
7
+ from detectron2.utils.visualizer import Visualizer
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import pandas as pd
11
+ from scipy import stats
12
+
13
+ from cubercnn import data, util, vis
14
+ from cubercnn.config import get_cfg_defaults
15
+ from cubercnn.data.build import (build_detection_test_loader,
16
+ build_detection_train_loader)
17
+ from cubercnn.data.dataset_mapper import DatasetMapper3D
18
+ from cubercnn.data.datasets import load_omni3d_json, simple_register
19
+ from detectron2.config import get_cfg
20
+ from detectron2.data import DatasetCatalog, MetadataCatalog
21
+ from detectron2.structures.boxes import BoxMode
22
+ from detectron2.utils.logger import setup_logger
23
+
24
+ color = '#384860'
25
+ second_color = '#97a6c4'
26
+
27
+ def load_gt(dataset='SUNRGBD', mode='test', single_im=True, filter=False, img_idx=150):
28
+
29
+ # we can do this block of code to get the categories reduced number of categories in the sunrgbd dataset as there normally is 83 categories, however we only work with 38.
30
+ config_file = 'configs/Base_Omni3D.yaml'
31
+ if filter:
32
+ cfg, filter_settings = get_config_and_filter_settings(config_file)
33
+ else:
34
+ filter_settings = None
35
+
36
+ if mode == 'test':
37
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset+'_test.json']
38
+ elif mode == 'train':
39
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset+'_train.json']
40
+
41
+ # Get Image and annotations
42
+ try:
43
+ dataset = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
44
+ except:
45
+ print('Dataset does not exist or is not in the correct format!')
46
+ exit()
47
+ imgIds = dataset.getImgIds()
48
+ imgs = dataset.loadImgs(imgIds)
49
+ if single_im:
50
+ # img = random.choice(imgs)
51
+ # 730 and 150 are used in the report
52
+ img = imgs[img_idx]
53
+ annIds = dataset.getAnnIds(imgIds=img['id'])
54
+ else:
55
+ # get all annotations
56
+ img = imgs
57
+ annIds = dataset.getAnnIds()
58
+
59
+ anns = dataset.loadAnns(annIds)
60
+
61
+ # Extract necessary annotations
62
+ R_cams = []
63
+ center_cams = []
64
+ dimensions_all = []
65
+ cats = []
66
+ bboxes = []
67
+ for instance in anns:
68
+ if 'bbox2D_tight' in instance and instance['bbox2D_tight'][0] != -1:
69
+ bboxes.append(instance['bbox2D_tight']) # boxes are XYXY_ABS by default
70
+
71
+ elif 'bbox2D_trunc' in instance and not np.all([val==-1 for val in instance['bbox2D_trunc']]):
72
+ bboxes.append(instance['bbox2D_trunc']) # boxes are XYXY_ABS by default
73
+
74
+ elif 'bbox2D_proj' in instance:
75
+ bboxes.append(instance['bbox2D_proj']) # boxes are XYXY_ABS by default
76
+
77
+ else:
78
+ continue
79
+
80
+ R_cams.append(instance['R_cam'])
81
+ center_cams.append(instance['center_cam'])
82
+ dimensions_all.append(instance['dimensions'])
83
+ cats.append(instance['category_name'])
84
+
85
+ return img, R_cams, center_cams, dimensions_all, cats, bboxes
86
+
87
+
88
+
89
+ def plot_scene(image_path, output_dir, center_cams, dimensions_all, Rs, K, cats, bboxes):
90
+ # TODO: currently this function does not filter out invalid annotations, but it should have the option to do so.
91
+ # Compute meshes
92
+ meshes = []
93
+ meshes_text = []
94
+ for idx, (center_cam, dimensions, pose, cat) in enumerate(zip(
95
+ center_cams, dimensions_all, Rs, cats
96
+ )):
97
+ bbox3D = center_cam + dimensions
98
+ meshes_text.append('{}'.format(cat))
99
+ color = [c/255.0 for c in util.get_color(idx)]
100
+ box_mesh = util.mesh_cuboid(bbox3D, pose, color=color)
101
+ meshes.append(box_mesh)
102
+
103
+ image_name = util.file_parts(image_path)[1]
104
+ print('File: {} with {} dets'.format(image_name, len(meshes)))
105
+ np.random.seed(0)
106
+ colors = [np.concatenate([np.random.random(3), np.array([0.6])], axis=0) for _ in range(len(meshes))]
107
+
108
+ # Plot
109
+ image = util.imread('datasets/'+image_path)
110
+ if len(meshes) > 0:
111
+ im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(image, np.array(K), meshes, colors=colors, text=meshes_text, scale=image.shape[0], blend_weight=0.5, blend_weight_overlay=0.85)
112
+
113
+ if False:
114
+ im_concat = np.concatenate((im_drawn_rgb, im_topdown), axis=1)
115
+ vis.imshow(im_concat)
116
+
117
+ util.imwrite(im_drawn_rgb, os.path.join(output_dir, image_name+'_boxes.jpg'))
118
+ util.imwrite(im_topdown, os.path.join(output_dir, image_name+'_novel.jpg'))
119
+ v_pred = Visualizer(image, None)
120
+ #bboxes = [[320, 150, 560, 340]] # low loss
121
+ #bboxes = [[350, 220, 440, 290]] # high loss
122
+ #bboxes = [[340, 163, 540, 297]] # fail loss
123
+ v_pred = v_pred.overlay_instances(boxes=np.array(bboxes), assigned_colors=colors)#[np.array([0.5,0,0.5])])#colors)
124
+ util.imwrite(v_pred.get_image(), os.path.join(output_dir, image_name+'_pred_boxes.jpg'))
125
+
126
+ #im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(v_pred.get_image(), np.array(K), meshes, colors=colors, text=meshes_text, scale=image.shape[0], blend_weight=0.5, blend_weight_overlay=0.85)
127
+ #util.imwrite(im_drawn_rgb, os.path.join(output_dir, image_name+'_boxes_with_2d.jpg'))
128
+ else:
129
+ print('No meshes')
130
+ util.imwrite(image, os.path.join(output_dir, image_name+'_boxes.jpg'))
131
+
132
+
133
+
134
+ def show_data(dataset, filter_invalid=False, output_dir='output/playground'):
135
+ # Load Image and Ground Truths
136
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, filter=filter_invalid)
137
+
138
+ # Create Output Directory
139
+ util.mkdir_if_missing(output_dir)
140
+
141
+ plot_scene(image['file_path'], output_dir, center_cams, dimensions_all, Rs, image['K'], cats, bboxes)
142
+
143
+
144
+ def category_distribution(dataset):
145
+ '''Plot a histogram of the category distribution in the dataset.'''
146
+ # Load Image and Ground Truths
147
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
148
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
149
+ config_file = 'configs/Base_Omni3D.yaml'
150
+ cfg, filter_settings = get_config_and_filter_settings(config_file)
151
+ annotation_file = 'datasets/Omni3D/SUNRGBD_train.json'
152
+ coco_api = COCO(annotation_file)
153
+ meta = MetadataCatalog.get('SUNRGBD')
154
+ cat_ids = sorted(coco_api.getCatIds(filter_settings['category_names']))
155
+ cats_sun = coco_api.loadCats(cat_ids)
156
+ thing_classes = [c["name"] for c in sorted(cats_sun, key=lambda x: x["id"])]
157
+
158
+ output_dir = 'output/figures/' + dataset
159
+ util.mkdir_if_missing(output_dir)
160
+
161
+ # histogram of categories
162
+ cats_all = cats + cats_t
163
+ # cats_unique = list(set(cats_all))
164
+ cats_unique = thing_classes
165
+ print('cats unique: ', len(cats_unique))
166
+ # make dict with count of each category
167
+ cats_count = {cat: cats_all.count(cat) for cat in cats_unique}
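# For reference, collections.Counter(cats_all) would compute the same counts in a single pass;
# the dict comprehension above additionally keeps zero-count categories from thing_classes.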
168
+ cats_sorted = dict(sorted(cats_count.items(), key=lambda x: x[1], reverse=True))
169
+
170
+ plt.figure(figsize=(14,5))
171
+ plt.bar(cats_sorted.keys(), cats_sorted.values())
172
+ plt.xticks(rotation=60, size=9)
173
+
174
+ plt.title('Category Distribution')
175
+ plt.savefig(os.path.join(output_dir, 'category_distribution.png'),dpi=300, bbox_inches='tight')
176
+ plt.close()
177
+
178
+ return cats_sorted
179
+
180
+ def spatial_statistics(dataset):
181
+ '''Compute spatial statistics of the dataset.
182
+ This is intended to reproduce Fig. 7 from the Omni3D paper;
183
+ however, the images must be standardised to a fixed size for it to work.
184
+ '''
185
+ # Load Image and Ground Truths
186
+ # this function filters out invalid images if there are no valid annotations in the image
187
+ # annotations in each image can also be marked as is_ignore => True
188
+ image_root = 'datasets'
189
+ cfg, filter_settings = get_config_and_filter_settings()
190
+ dataset_names = ['SUNRGBD_train','SUNRGBD_test','SUNRGBD_val']
191
+ output_dir = 'output/figures/' + dataset
192
+
193
+ # this is almost the same as the simple_register function, but it also stores the model metadata
194
+ # which is needed for the load_omni3d_json function
195
+ data.register_and_store_model_metadata(None, output_dir, filter_settings=filter_settings)
196
+
197
+ data_dicts = []
198
+ for dataset_name in dataset_names:
199
+ json_file = 'datasets/Omni3D/'+dataset_name+'.json'
200
+ data_dict = load_omni3d_json(json_file, image_root, dataset_name, filter_settings, filter_empty=True)
201
+ data_dicts.extend(data_dict)
202
+
203
+
204
+ # standardise the images to a fixed size
205
+ # and map the annotations to the standardised images
206
+ std_image_size = (480//4, 640//4)
207
+ tot_outliers = 0
208
+ img = np.zeros(std_image_size)
209
+ for img_dict in data_dicts:
210
+ original_width = img_dict['width']
211
+ original_height = img_dict['height']
212
+
213
+ # Calculate the scale factor for resizing
214
+ scale_x = std_image_size[1] / original_width
215
+ scale_y = std_image_size[0] / original_height
216
+
217
+ # Update the image size in the annotation
218
+ img_dict['width'] = std_image_size[1]
219
+ img_dict['height'] = std_image_size[0]
220
+ for anno in img_dict['annotations']:
221
+ if not anno['ignore']:
222
+ # Update the 2D box coordinates (boxes are XYWH)
223
+ anno['bbox2D_tight'][0] *= scale_x
224
+ anno['bbox2D_tight'][1] *= scale_y
225
+ anno['bbox2D_tight'][2] *= scale_x
226
+ anno['bbox2D_tight'][3] *= scale_y
227
+ # get the centerpoint of the annotation as (x, y)
228
+ # x0, y0, x1, y1 = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
229
+ x0, y0, x1, y1 = anno['bbox2D_tight']
230
+ x_m, y_m = int((x0+x1)/2), int((y0+y1)/2)
231
+ if x_m >= std_image_size[1] or x_m < 0:
232
+ # print(f'x out of line {x_m}')
233
+ tot_outliers += 1
234
+ elif y_m >= std_image_size[0] or y_m < 0:
235
+ # print(f'y out of line {y_m}')
236
+ tot_outliers += 1
237
+ else:
238
+ img[y_m, x_m] += 1
239
+ else:
240
+ # Remove the annotation if it is marked as ignore
241
+ img_dict['annotations'].remove(anno)
242
+
243
+
244
+ print('num center points outside frame: ', tot_outliers)
245
+ img = img/img.max()
246
+ # the count in this bin is so large that all other points become invisible, so it is removed.
247
+ img[0,0] = 0.00
248
+ img = img/img.max()
249
+ plt.figure()
250
+ plt.imshow(img, cmap='gray_r', vmin=0, vmax=1)
251
+ plt.xticks([]); plt.yticks([])
252
+ plt.title('Histogram of 2D box centre points')
253
+ # plt.box(False)
254
+ plt.savefig(os.path.join(output_dir, '2d_histogram.png'),dpi=300, bbox_inches='tight')
255
+ plt.close()
256
+ return
257
+
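# A small worked example of the box rescaling used in spatial_statistics above
# (the box values are made up): a box [100, 50, 300, 250] in a 640x480 image mapped onto
# the 160x120 grid has scale factors 0.25, so its centre lands at pixel (50, 37).
def _rescale_centre_example():
    scale_x, scale_y = 160 / 640, 120 / 480
    x0, y0, x1, y1 = 100 * scale_x, 50 * scale_y, 300 * scale_x, 250 * scale_y
    return int((x0 + x1) / 2), int((y0 + y1) / 2)  # -> (50, 37)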
258
+ def AP_vs_no_of_classes(dataset, files:list=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
259
+ '''Search the log file for the precision numbers corresponding to the last iteration
260
+ then parse it into a pd.DataFrame and plot AP3D against the per-class annotation time'''
261
+ # search the file from the back until the line
262
+ # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
263
+ # is found
264
+
265
+ target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
266
+ model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
267
+ df = []
268
+ for file, model_name in zip(files, model_names):
269
+ df_i = search_file_backwards(file, target_line).rename(columns={'AP3D':f'{model_name} AP3D', 'AP2D':f'{model_name} AP2D'})
270
+ assert df_i is not None, 'df not found'
271
+ df.append(df_i)
272
+ # merge df's
273
+ df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
274
+ # sort df by ap3d of model 1
275
+ df = df.sort_values(by='Base Cube R-CNN AP3D', ascending=False)
276
+
277
+ cats = category_distribution(dataset)
278
+ df.sort_values(by='category', inplace=True)
279
+ cats = dict(sorted(cats.items()))
280
+ merged_df = pd.merge(df.reset_index(), pd.DataFrame(cats.values(), columns=['cats']), left_index=True, right_index=True)
281
+ merged_df = merged_df.sort_values(by='cats')
282
+ merged_df = merged_df.drop('index',axis=1)
283
+ merged_df = merged_df.reset_index(drop=True)
284
+
285
+
286
+ fig, ax = plt.subplots(figsize=(12,8))
287
+ for model_name in model_names:
288
+ if model_name == 'Base Cube R-CNN':
289
+ scale = 114
290
+ else:
291
+ scale = 10.15
292
+ # convert the annotation time to hours
293
+ time = merged_df['cats']*scale / 60 / 60
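# 'cats' holds the per-category annotation count, so count * seconds-per-annotation / 3600
# gives hours; the scale values above (114 vs 10.15) presumably reflect the cost of a full
# 3D annotation versus a 2D-only annotation.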
294
+ ax.scatter(time, merged_df[f'{model_name} AP3D'].values, s=merged_df[f'{model_name} AP2D'].values*2, alpha=0.5, label=model_name)
295
+
296
+ for i, txt in enumerate(merged_df['category']):
297
+ ax.text(time[i], merged_df[f'{model_name} AP3D'].values[i], txt, fontsize=merged_df[f'{model_name} AP3D'].values[i]*0.3+3)
298
+
299
+ correlation_coef = np.corrcoef(time, merged_df[f'{model_name} AP3D'].values)[0, 1]
300
+ line_fit = np.polyfit(time, merged_df[f'{model_name} AP3D'].values, 1)
301
+
302
+ # plot the line of best fit
303
+ ax.plot(time, np.poly1d(line_fit)(time), linestyle='--',alpha=0.5, label=f'Linear fit (R={correlation_coef:.2f})')
304
+
305
+ # Set labels and title
306
+ ax.set_xlabel('Annotation time (h)')
307
+ ax.set_ylabel('AP3D')
308
+ ax.set_xscale('log')
309
+ ax.set_title('AP3D vs class-wise annotation time')
310
+ ax.legend(title='AP3D scaled by AP2D')
311
+
312
+ # Save the plot
313
+ plt.savefig('output/figures/'+dataset+'/AP_vs_no_of_classes_all.png', dpi=300, bbox_inches='tight')
314
+ plt.close()
315
+
316
+ return
317
+
318
+ def AP3D_vs_AP2D(dataset, mode = 'standard', files=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
319
+ '''Search the log file for the precision numbers corresponding to the last iteration
320
+ then parse it into a pd.DataFrame and plot AP3D against AP2D for each category'''
321
+
322
+ # search the file from the back until the line
323
+ # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
324
+ # is found
325
+
326
+ target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
327
+ model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
328
+ df = []
329
+ for file, model_name in zip(files, model_names):
330
+ df_i = search_file_backwards(file, target_line).rename(columns={'AP3D':f'{model_name} AP3D', 'AP2D':f'{model_name} AP2D'})
331
+ assert df_i is not None, 'df not found'
332
+ df.append(df_i)
333
+ # merge df's
334
+ df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
335
+ # sort df by ap3d of model 1
336
+ df = df.sort_values(by='Base Cube R-CNN AP3D', ascending=False)
337
+
338
+ cats = category_distribution(dataset)
339
+ df.sort_values(by='category', inplace=True)
340
+ cats = dict(sorted(cats.items()))
341
+ merged_df = pd.merge(df.reset_index(), pd.DataFrame(cats.values(), columns=['cats']), left_index=True, right_index=True)
342
+ merged_df = merged_df.sort_values(by='cats')
343
+ merged_df = merged_df.drop('index',axis=1)
344
+ merged_df = merged_df.reset_index(drop=True)
345
+
346
+ # mode = 'standard' # 'log'
347
+
348
+ fig, ax = plt.subplots(figsize=(12,8))
349
+ for model_name in model_names:
350
+ if mode == 'standard': s=merged_df[f'{model_name} AP2D'].values*2
351
+ else: s = None
352
+ # we have to add 0.001 to the values to avoid log(0) errors
353
+ ax.scatter(merged_df[f'{model_name} AP2D'].values+0.001, merged_df[f'{model_name} AP3D'].values+0.001, alpha=0.5, label=model_name, s=s)
354
+ for i, txt in enumerate(merged_df['category']):
355
+ if mode == 'standard': fontsize=merged_df[f'{model_name} AP3D'].values[i]*0.3+3
356
+ else: fontsize=7
357
+ ax.text(merged_df[f'{model_name} AP2D'].values[i]+0.001, merged_df[f'{model_name} AP3D'].values[i]+0.001, txt,fontsize=fontsize)
358
+ # plot average line
359
+ ax.plot((0, 70), (0, 70), linestyle='--', color=color, alpha=0.3, label=f'AP2D=AP3D')
360
+
361
+ # Set labels and title
362
+ if mode == 'log':
363
+ ax.set_xscale('log')
364
+ ax.set_yscale('log')
365
+ ax.set_xlabel('AP2D')
366
+ ax.set_ylabel('AP3D')
367
+ # ax.set_xlim(0.1, 75); ax.set_ylim(0.1, 75)
368
+ ax.set_title('AP in 3D vs AP in 2D')
369
+ ax.legend()
370
+ # if mode == 'log':
371
+ # # for some obscure reason the log plot fails to save
372
+ # plt.show()
373
+
374
+ # # Save the plot
375
+ # else:
376
+ plt.savefig('output/figures/'+dataset+f'/AP3D_vs_AP2D_all_{mode}.png', dpi=300, bbox_inches='tight')
377
+ plt.close()
378
+
379
+ return
380
+
381
+
382
+ def search_file_backwards(file_path:str, target_line:str) -> pd.DataFrame:
383
+ '''Search a file backwards for a target line and return the table of the performance of the model. The point of this is to parse the part of the log file that looks like this
384
+ | category | AP2D | AP3D | category | AP2D | AP3D | category | AP2D | AP3D |
385
+ |:----------:|:--------|:----------|:-----------:|:---------|:---------|:------------:|:----------|:-----------|
386
+ | chair | 45.9374 | 53.4913 | table | 34.5982 | 39.7769 | cabinet | 16.3693 | 14.0878 |
387
+ | lamp | 24.8081 | 7.67653 | books | 0.928978 | 0.599711 | sofa | 49.2354 | 57.9649 |
388
+
389
+ ...
390
+ To a pandas DataFrame that has 3 columns: category, AP2D, AP3D'''
391
+ import re
392
+ with open(file_path, 'r') as file:
393
+ lines = file.readlines()
394
+ for i, line in enumerate(reversed(lines)):
395
+ is_found = re.search(f'.*{target_line}$', line)
396
+ if is_found:
397
+ table = lines[-i:-i+15]
398
+ tab_as_str= ' '.join(table)
399
+ # NOTE: this parsing is ugly, but it handles the markdown-style table in the log
400
+ df = pd.read_csv( StringIO(tab_as_str.replace(' ', '')), # Get rid of whitespaces
401
+ sep='|',).dropna(axis=1, how='all').drop(0)
402
+ # https://stackoverflow.com/a/65884212
403
+ df.columns = pd.MultiIndex.from_frame(df.columns.str.split('.', expand=True)
404
+ .to_frame().fillna('0'))
405
+ df = df.stack().reset_index(level=1, drop=True).reset_index().drop('index', axis=1)
406
+ df['AP3D'] = df['AP3D'].astype(float)
407
+ df['AP2D'] = df['AP2D'].astype(float)
408
+
409
+ return df
410
+
411
+ return None
412
+
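# Minimal usage sketch for search_file_backwards; the log path is just one of the defaults above.
def _search_example(log_file='output/Baseline_sgd/log.txt'):
    target = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
    df = search_file_backwards(log_file, target)
    if df is not None:
        # highest-scoring categories first
        print(df.sort_values('AP3D', ascending=False).head())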
413
+
414
+ def get_config_and_filter_settings(config_file='configs/Base_Omni3D.yaml'):
415
+ # we must load the config file to get the filter settings
416
+ cfg = get_cfg()
417
+ get_cfg_defaults(cfg)
418
+ cfg.merge_from_file(config_file)
419
+ # must setup logger to get info about filtered out annotations
420
+ setup_logger(output=cfg.OUTPUT_DIR, name="cubercnn")
421
+ filter_settings = data.get_filter_settings_from_cfg(cfg)
422
+ return cfg, filter_settings
423
+
424
+
425
+ def init_dataloader():
426
+ '''Dataloader setup.
427
+ Currently unused: it is unclear how the data.Omni3D dataset class differs from the load_omni3d_json function, so this is kept as a third alternative. The train script does something similar to this.'''
428
+ cfg, filter_settings = get_config_and_filter_settings()
429
+
430
+ dataset_names = ['SUNRGBD_train','SUNRGBD_val']
431
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names]
432
+ for dataset_name in dataset_names:
433
+ simple_register(dataset_name, filter_settings, filter_empty=True)
434
+
435
+ # Get Image and annotations
436
+ datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
437
+ data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings)
438
+
439
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
440
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
441
+
442
+ infos = datasets.dataset['info']
443
+
444
+ dataset_id_to_unknown_cats = {}
445
+ possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1))
446
+
447
+ dataset_id_to_src = {}
448
+
449
+ for info in infos:
450
+ dataset_id = info['id']
451
+ known_category_training_ids = set()
452
+
453
+ if not dataset_id in dataset_id_to_src:
454
+ dataset_id_to_src[dataset_id] = info['source']
455
+
456
+ for id in info['known_category_ids']:
457
+ if id in dataset_id_to_contiguous_id:
458
+ known_category_training_ids.add(dataset_id_to_contiguous_id[id])
459
+
460
+ # determine and store the unknown categories.
461
+ unknown_categories = possible_categories - known_category_training_ids
462
+ dataset_id_to_unknown_cats[dataset_id] = unknown_categories
463
+
464
+ from detectron2 import data as d2data
465
+ NoOPaug = d2data.transforms.NoOpTransform()
466
+
467
+ # def NoOPaug(input):
468
+ # return input
469
+ # TODO: how to load in images without having them resized?
470
+ # data_mapper = DatasetMapper3D(cfg, augmentations=[NoOPaug], is_train=True)
471
+ data_mapper = DatasetMapper3D(cfg, is_train=True)
472
+ # test loader does resize images, like the train loader does
473
+ # this is the function that filters out the invalid annotations
474
+ data_loader = build_detection_train_loader(cfg, mapper=data_mapper, dataset_id_to_src=dataset_id_to_src, num_workers=1)
475
+ # data_loader = build_detection_test_loader(cfg, dataset_names[1], num_workers=1)
476
+
477
+ # this is a detectron 2 thing that we just have to do
478
+ data_mapper.dataset_id_to_unknown_cats = dataset_id_to_unknown_cats
479
+
480
+
481
+ for item in data_loader:
482
+ print(item)
483
+
484
+ def vol_over_cat(dataset):
485
+ '''
486
+ Error bar plot of object volume per category.
487
+ '''
488
+ # Load Image and Ground Truths
489
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
490
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
491
+
492
+ output_dir = 'output/figures/' + dataset
493
+ util.mkdir_if_missing(output_dir)
494
+
495
+ # histogram of categories
496
+ cats_all = cats + cats_t
497
+ cats_unique = list(set(cats_all))
498
+
499
+ # Create dictionary with np.prod(dimensions) for each category
500
+ cats_vol = {cat: [] for cat in cats_unique}
501
+ for cat, dims in zip(cats, dimensions_all):
502
+ if np.prod(dims) > 0:
503
+ cats_vol[cat].append(np.prod(dims))
504
+ for cat, dims in zip(cats_t, dimensions_all_t):
505
+ if np.prod(dims) > 0:
506
+ cats_vol[cat].append(np.prod(dims))
507
+
508
+ # make dict with mean and std of each category
509
+ cats_mean = {cat: np.mean(cats_vol[cat]) for cat in cats_unique}
510
+ cats_error = {cat: np.std(cats_vol[cat]) for cat in cats_unique}
511
+
512
+ keys = np.array(list(cats_mean.keys()))
513
+ means = np.array(list(cats_mean.values()))
514
+ errors = np.array(list(cats_error.values()))
515
+
516
+ # Calculate Z-scores for 5th and 95th percentiles
517
+ from scipy.stats import norm
518
+ z_lower = norm.ppf(0.05)
519
+ z_upper = norm.ppf(0.95)
520
+ bounds = []
521
+ for mean, std in zip(means, errors):
522
+ # Calculate the lower and upper bounds of the interval
523
+ lower_bound = mean + z_lower * std
524
+ upper_bound = mean + z_upper * std
525
+
526
+ bounds.append((max(0,lower_bound), upper_bound))
527
+
528
+ plt.figure(figsize=(14,5))
529
+ for i, (mean, (lower_bound, upper_bound)) in enumerate(zip(means, bounds)):
530
+ plt.vlines(x=i, ymin=lower_bound, ymax=upper_bound, color='gray', linewidth=2)
531
+ plt.plot([i], [mean], marker='o', color=color)
532
+
533
+ plt.xticks(np.arange(len(keys)), keys, rotation=60, size=9)
534
+ plt.xlabel('Category')
535
+ plt.ylabel('Volume')
536
+ plt.title('Category Distribution')
537
+ plt.savefig(os.path.join(output_dir, 'volume_distribution.png'), dpi=300, bbox_inches='tight')
538
+ plt.close()
539
+
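# Worked example of the percentile bounds used above (numbers are hypothetical):
# for a category with mean volume 0.5 m^3 and std 0.2 m^3,
#   lower = max(0, 0.5 + norm.ppf(0.05) * 0.2) ~= 0.17
#   upper = 0.5 + norm.ppf(0.95) * 0.2 ~= 0.83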
540
+ def gt_stats(dataset):
541
+ '''
542
+ Histograms of object centre and dimension distributions.
543
+ '''
544
+ # Load Image and Ground Truths
545
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
546
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
547
+
548
+ output_dir = 'output/figures/' + dataset
549
+ util.mkdir_if_missing(output_dir)
550
+
551
+ # histogram of centers
552
+ center_all = center_cams + center_cams_t
553
+ center_all = np.transpose(np.array(center_all))
554
+
555
+ # Filter -1 annotations
556
+ valid_columns = center_all[0] != -1
557
+ center_all = center_all[:,valid_columns]
558
+
559
+ x_label = ['x', 'y', 'z']
560
+ fig, axes = plt.subplots(1, len(center_all), figsize=(18, 5))
561
+ for i in range(len(center_all)):
562
+ axes[i].hist(center_all[i], color=color, bins=20)
563
+ axes[i].set_xlabel(x_label[i])
564
+ axes[i].set_ylabel('Count')
565
+ fig.suptitle('Center Distribution in Meters')
566
+ plt.savefig(os.path.join(output_dir, 'center.png'), dpi=300, bbox_inches='tight')
567
+ plt.close()
568
+
569
+ # histogram of dimensions
570
+ dimensions_all = dimensions_all + dimensions_all_t
571
+ dimensions_all = np.transpose(np.array(dimensions_all))
572
+
573
+ # Filter -1 annotations
574
+ valid_columns = dimensions_all[0] != -1
575
+ dimensions_all = dimensions_all[:,valid_columns]
576
+
577
+ x_label = ['w', 'h', 'l']
578
+ fig, axes = plt.subplots(1, len(dimensions_all), figsize=(18, 5))
579
+ for i in range(len(dimensions_all)):
580
+ axes[i].hist(dimensions_all[i], color=color, bins=20)
581
+ axes[i].set_xlabel(x_label[i])
582
+ axes[i].set_ylabel('Count')
583
+ fig.suptitle('Dimensions Distribution in Meters')
584
+ plt.savefig(os.path.join(output_dir, 'dimensions.png'), dpi=300, bbox_inches='tight')
585
+ plt.close()
586
+
587
+ def report_figures(dataset, filter_invalid=False, output_dir='output/report_images'):
588
+ # Create Output Directory
589
+ util.mkdir_if_missing(output_dir)
590
+ util.mkdir_if_missing(output_dir+'/low_green')
591
+ util.mkdir_if_missing(output_dir+'/high_green')
592
+ util.mkdir_if_missing(output_dir+'/fail_green')
593
+ util.mkdir_if_missing(output_dir+'/low_red')
594
+ util.mkdir_if_missing(output_dir+'/high_red')
595
+ util.mkdir_if_missing(output_dir+'/fail_red')
596
+ util.mkdir_if_missing(output_dir+'/low_blue')
597
+ util.mkdir_if_missing(output_dir+'/high_blue')
598
+ util.mkdir_if_missing(output_dir+'/fail_blue')
599
+
600
+ # Load Image and Ground Truths
601
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, filter=filter_invalid, img_idx=352)
602
+
603
+ gt_center = center_cams[1:]
604
+ gt_dim = dimensions_all[1:]
605
+ gt_Rs = Rs[1:]
606
+ cats = cats[1:]
607
+ gt_bb = bboxes[1:]
608
+
609
+ # Make low loss boxes for IoU, ps. z and proj
610
+ center = gt_center[-1]
611
+ dim = gt_dim[-1]
612
+ R = gt_Rs[-1]
613
+ cat = cats[-1]
614
+ bb = gt_bb[-1]
615
+ plot_scene(image['file_path'], output_dir+'/low_green', [center], [dim], [R], image['K'], [cat], [bb])
616
+
617
+ # Make high loss boxes for IoU, ps. z and proj
618
+ center = [gt_center[-1][0],gt_center[-1][1],gt_center[-1][2]+3]
619
+ dim = gt_dim[-1]
620
+ R = gt_Rs[-1]
621
+ cat = cats[-1]
622
+ bb = gt_bb[-1]
623
+ plot_scene(image['file_path'], output_dir+'/high_green', [center], [dim], [R], image['K'], [cat], [bb])
624
+
625
+ # Make fail loss boxes for IoU, ps. z and proj
626
+ center = [gt_center[-1][0]-0.03,gt_center[-1][1],gt_center[-1][2]]
627
+ dim = [0.05,0.71,0.05]
628
+ R = util.euler2mat(np.array([0,0,45]))
629
+ cat = cats[-1]
630
+ bb = gt_bb[-1]
631
+ plot_scene(image['file_path'], output_dir+'/fail_green', [center], [dim], [R], image['K'], [cat], [bb])
632
+
633
+ # Make low loss boxes for range and seg
634
+ center = gt_center[0]
635
+ dim = gt_dim[0]
636
+ R = gt_Rs[0]
637
+ cat = cats[0]
638
+ bb = gt_bb[0]
639
+ plot_scene(image['file_path'], output_dir+'/low_red', [center], [dim], [R], image['K'], [cat], [bb])
640
+
641
+ # Make high loss boxes for range and seg
642
+ center = [gt_center[0][0],gt_center[0][1]+0.3,gt_center[0][2]]
643
+ dim = [gt_dim[0][0]+1.5,gt_dim[0][1]-0.6,gt_dim[0][2]]
644
+ R = gt_Rs[0]
645
+ cat = cats[0]
646
+ bb = gt_bb[0]
647
+ plot_scene(image['file_path'], output_dir+'/high_red', [center], [dim], [R], image['K'], [cat], [bb])
648
+
649
+ # Make fail loss boxes for range and seg
650
+ center = [gt_center[0][0]+0.25,gt_center[0][1],gt_center[0][2]]
651
+ dim = [gt_dim[0][0]+0.7,gt_dim[0][1],gt_dim[0][2]]
652
+ R = gt_Rs[-1]
653
+ cat = cats[-1]
654
+ bb = gt_bb[-1]
655
+ plot_scene(image['file_path'], output_dir+'/fail_red', [center], [dim], [R], image['K'], [cat], [bb])
656
+
657
+ # Make low loss boxes for dim, pose and align
658
+ center = gt_center[1:]
659
+ dim = [[gt_dim[1][0]*1.5,gt_dim[1][1],gt_dim[1][2]*1.5], gt_dim[2]]
660
+ R = gt_Rs[1:]
661
+ cat = cats[1:]
662
+ bb = gt_bb[1:]
663
+ plot_scene(image['file_path'], output_dir+'/low_blue', center, dim, R, image['K'], cat, bb)
664
+
665
+ # Make high loss boxes for dim, pose and align
666
+ center = gt_center[1:]
667
+ dim = gt_dim[1:]
668
+ R = [util.euler2mat(util.mat2euler(np.array(gt_Rs[1]))+[20,0,0]), util.euler2mat(util.mat2euler(np.array(gt_Rs[2]))+[-20,0,0])]
669
+ cat = cats[1:]
670
+ bb = gt_bb[1:]
671
+ plot_scene(image['file_path'], output_dir+'/high_blue', center, dim, R, image['K'], cat, bb)
672
+
673
+ # Make fail loss boxes for dim, pose and align
674
+ center = gt_center[1:]
675
+ dim = [[gt_dim[1][0],gt_dim[1][1],gt_dim[1][2]],[gt_dim[2][1],gt_dim[2][0],gt_dim[2][2]]]
676
+ R = [util.euler2mat(util.mat2euler(np.array(gt_Rs[1]))+[1,0,0]), util.euler2mat(util.mat2euler(np.array(gt_Rs[2]))+[1,0,0])]
677
+ cat = cats[1:]
678
+ bb = gt_bb[1:]
679
+ plot_scene(image['file_path'], output_dir+'/fail_blue', center, dim, R, image['K'], cat, bb)
680
+
681
+ return True
682
+
683
+ def gt_stats_in_terms_of_sigma(dataset):
684
+ '''
685
+ Same histograms as gt_stats, but with the dimension axes expressed in terms of sigma.
686
+ '''
687
+ # Load Image and Ground Truths
688
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
689
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
690
+
691
+ output_dir = 'output/figures/' + dataset
692
+ util.mkdir_if_missing(output_dir)
693
+
694
+ # histogram of centers
695
+ center_all = center_cams + center_cams_t
696
+ center_all = np.transpose(np.array(center_all))
697
+
698
+ # Filter -1 annotations
699
+ valid_columns = center_all[0] != -1
700
+ center_all = center_all[:,valid_columns]
701
+
702
+ x_label = ['x', 'y', 'z']
703
+ fig, axes = plt.subplots(1, len(center_all), figsize=(18, 5))
704
+ for i in range(len(center_all)):
705
+ axes[i].hist(center_all[i], color=color, bins=20)
706
+ axes[i].set_xlabel(x_label[i])
707
+ axes[i].set_ylabel('Count')
708
+ fig.suptitle('Center Distribution in Meters')
709
+ plt.savefig(os.path.join(output_dir, 'center.png'), dpi=300, bbox_inches='tight')
710
+ plt.close()
711
+
712
+ # histogram of dimensions
713
+ dimensions_all = dimensions_all + dimensions_all_t
714
+ dimensions_all = np.transpose(np.array(dimensions_all))
715
+
716
+ # Filter -1 annotations
717
+ valid_columns = dimensions_all[0] != -1
718
+ dimensions_all = dimensions_all[:,valid_columns]
719
+
720
+ x_label = ['w', 'h', 'l']
721
+ fig, axes = plt.subplots(1, len(dimensions_all), figsize=(18, 5))
722
+ for i in range(len(dimensions_all)):
723
+ axes[i].hist(dimensions_all[i], color=color, bins=20, density=True)
724
+
725
+ # Plot normal distribution
726
+ mu, sigma = np.mean(dimensions_all[i]), np.std(dimensions_all[i])
727
+ x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
728
+ axes[i].plot(x, stats.norm.pdf(x, mu, sigma))
729
+ y_lim = axes[i].get_ylim()[1]
730
+ axes[i].vlines(mu+sigma, 0, y_lim, linestyle='--', label='$\sigma$', color='gray')
731
+ axes[i].vlines(mu-sigma, 0, y_lim, linestyle='--', label='$\sigma$', color='gray')
732
+ axes[i].vlines(1.4, 0, y_lim, linestyle='--', color='red', label='pred')
733
+ if i != 0:
734
+ axes[i].plot((mu+sigma,1.4), (y_lim/2,y_lim/2), color='c', label='loss')
735
+ axes[i].set_xlabel(x_label[i])
736
+ axes[i].set_ylabel('density')
737
+ # Set xticks in terms of sigma
738
+ xticks = [mu - 3 * sigma, mu - 2 * sigma, mu - sigma, mu, mu + sigma, mu + 2 * sigma, mu + 3 * sigma, mu + 4 * sigma, mu + 5 * sigma, mu + 6 * sigma]
739
+ xticklabels = ['-3$\sigma$', '-2$\sigma$', '-$\sigma$', '0', '$\sigma$', '$2\sigma$', '$3\sigma$', '$4\sigma$', '$5\sigma$', '$6\sigma$']
740
+ axes[i].set_xticks(xticks)
741
+ axes[i].set_xticklabels(xticklabels)
742
+ axes[-1].legend()
743
+ fig.suptitle('Dimensions Distribution in Meters')
744
+ plt.savefig(os.path.join(output_dir, 'dimensions_sigma.png'), dpi=300, bbox_inches='tight')
745
+ plt.close()
746
+
747
+ return True
748
+
749
+ def parallel_coordinate_plot(dataset='SUNRGBD', files:list=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
750
+ '''Search the log file for the precision numbers corresponding to the last iteration
751
+ then parse it into a pd.DataFrame and draw a parallel coordinate plot of per-category AP3D for each model'''
752
+ import plotly.graph_objects as go
753
+
754
+ # df with each model as a column and performance for each class as rows
755
+ # search the file from the back until the line
756
+ # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
757
+ # is found
758
+ target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
759
+ model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
760
+ df = []
761
+ for file, model_name in zip(files, model_names):
762
+ df_i = search_file_backwards(file, target_line).drop(['AP2D'], axis=1).rename(columns={'AP3D':model_name})
763
+ assert df_i is not None, 'df not found'
764
+ df.append(df_i)
765
+ # merge df's
766
+ df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
767
+ # sort df by ap3d of model 1
768
+ df = df.sort_values(by='Base Cube R-CNN', ascending=False)
769
+ # encode each category as a number
770
+ df['category_num'] = list(reversed([i for i in range(len(df))]))
771
+
772
+ # https://plotly.com/python/parallel-coordinates-plot/
773
+ fig = go.Figure(data=
774
+ go.Parcoords(
775
+ line = dict(color = df.iloc[:, 1],
776
+ # colorscale = [[0,'purple'],[0.5,'lightseagreen'],[1,'gold']]),
777
+ colorscale = 'Viridis'),
778
+ visible = True,
779
+ dimensions = list([
780
+ dict(tickvals = df['category_num'],
781
+ ticktext = df['category'],
782
+ label = 'Categories', values = df['category_num']),
783
+ dict(range = [0,70],
784
+ constraintrange = [5,70],
785
+ label = model_names[0], values = df[model_names[0]]),
786
+ dict(range = [0,40],
787
+ label = model_names[2], values = df[model_names[2]]),
788
+ dict(range = [0,40],
789
+ label = model_names[4], values = df[model_names[4]]),
790
+ dict(range = [0,40],
791
+ label = model_names[1], values = df[model_names[1]]),
792
+ dict(range = [0,40],
793
+ label = model_names[3], values = df[model_names[3]]),
794
+ ]),
795
+ )
796
+ )
797
+
798
+ fig.update_layout(
799
+ plot_bgcolor = 'white',
800
+ paper_bgcolor = 'white',
801
+ title={
802
+ 'text': "AP3D per category for each model",
803
+ 'y':0.96,
804
+ 'x':0.5,
805
+ 'xanchor': 'center',
806
+ 'yanchor': 'top'},
807
+ margin=dict(l=65, r=25, t=80, b=5)
808
+ )
809
+ # pip install --upgrade "kaleido==0.1.*"
810
+ fig.write_image('output/figures/SUNRGBD/parallel_coordinate_plot.png', scale=3, format='png')
811
+ # fig.show()
812
+
813
+
814
+ if __name__ == '__main__':
815
+ # show_data('SUNRGBD', filter_invalid=False, output_dir='output/playground/no_filter') #{SUNRGBD,ARKitScenes,KITTI,nuScenes,Objectron,Hypersim}
816
+ # show_data('SUNRGBD', filter_invalid=True, output_dir='output/playground/with_filter') #{SUNRGBD,ARKitScenes,KITTI,nuScenes,Objectron,Hypersim}
817
+ # _ = category_distribution('SUNRGBD')
818
+ AP_vs_no_of_classes('SUNRGBD')
819
+ #spatial_statistics('SUNRGBD')
820
+ # AP3D_vs_AP2D('SUNRGBD')
821
+ # AP3D_vs_AP2D('SUNRGBD', mode='log')
822
+ # init_dataloader()
823
+ # vol_over_cat('SUNRGBD')
824
+ # gt_stats('SUNRGBD')
825
+ # gt_stats_in_terms_of_sigma('SUNRGBD')
826
+ #gt_stats('SUNRGBD')
827
+
828
+ # report_figures('SUNRGBD')
829
+
830
+ parallel_coordinate_plot()
app.py ADDED
@@ -0,0 +1,155 @@
1
+ import numpy as np
2
+ import gradio as gr
3
+ import os
4
+ import sys
5
+ import numpy as np
6
+ import torch
7
+
8
+ from detectron2.checkpoint import DetectionCheckpointer
9
+ from detectron2.config import get_cfg
10
+ from detectron2.data import transforms as T
11
+
12
+ sys.path.append(os.getcwd())
13
+ np.set_printoptions(suppress=True)
14
+
15
+ from cubercnn.config import get_cfg_defaults
16
+ from cubercnn.modeling.meta_arch import build_model
17
+ from cubercnn.modeling.backbone import build_dla_from_vision_fpn_backbone # imported for its registration side effect, even though it is not referenced directly
18
+
19
+ from cubercnn import util, vis
20
+
21
+
22
+ def do_test(im, threshold, model_str):
23
+ if im is None:
24
+ return None, None
25
+ model = load_model_config()
26
+
27
+ model.eval()
28
+
29
+ thres = threshold
30
+
31
+ min_size = 512
32
+ max_size = 4096
33
+ augmentations = T.AugmentationList([T.ResizeShortestEdge(min_size, max_size, "choice")])
34
+
35
+ category_path = 'configs/category_meta.json'
36
+
37
+ # store locally if needed
38
+ if category_path.startswith(util.CubeRCNNHandler.PREFIX):
39
+ category_path = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, category_path)
40
+
41
+ metadata = util.load_json(category_path)
42
+ cats = metadata['thing_classes']
43
+
44
+ image_shape = im.shape[:2] # h, w
45
+
46
+ h, w = image_shape
47
+
48
+ focal_length_ndc = 4.0
49
+ focal_length = focal_length_ndc * h / 2
50
+
51
+ px, py = w/2, h/2
52
+
53
+ K = np.array([
54
+ [focal_length, 0.0, px],
55
+ [0.0, focal_length, py],
56
+ [0.0, 0.0, 1.0]
57
+ ])
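# With no calibration available, the intrinsics are guessed: a virtual focal length in
# normalized device coordinates (focal_length_ndc = 4.0) is converted to pixels via
# f = f_ndc * h / 2, and the principal point is assumed to be the image centre.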
58
+
59
+ # apply the resize augmentation to the input image
60
+ aug_input = T.AugInput(im)
61
+ tfms = augmentations(aug_input)
62
+ image = aug_input.image
63
+ # model.to(device)
64
+ batched = [{
65
+ 'image': torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))),
66
+ 'height': image_shape[0], 'width': image_shape[1], 'K': K
67
+ }]
68
+ with torch.no_grad():
69
+ dets = model(batched)[0]['instances']
70
+
71
+ n_det = len(dets)
72
+
73
+ meshes = []
74
+ meshes_text = []
75
+
76
+ if n_det > 0:
77
+ for idx, (corners3D, center_cam, center_2D, dimensions, pose, score, cat_idx) in enumerate(zip(
78
+ dets.pred_bbox3D, dets.pred_center_cam, dets.pred_center_2D, dets.pred_dimensions,
79
+ dets.pred_pose, dets.scores, dets.pred_classes
80
+ )):
81
+
82
+ # skip
83
+ if score < thres:
84
+ continue
85
+
86
+ cat = cats[cat_idx]
87
+
88
+ bbox3D = center_cam.tolist() + dimensions.tolist()
89
+ meshes_text.append('{} {:.2f}'.format(cat, score))
90
+ color = [c/255.0 for c in util.get_color(idx)]
91
+ box_mesh = util.mesh_cuboid(bbox3D, pose.tolist(), color=color)
92
+ meshes.append(box_mesh)
93
+
94
+ # print('File with {} dets'.format(len(meshes)))
95
+
96
+ if len(meshes) > 0:
97
+ im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(im, K, meshes, text=meshes_text, scale=im.shape[0], blend_weight=0.5, blend_weight_overlay=0.85)
98
+ im_drawn_rgb, im_topdown = im_drawn_rgb.astype(np.uint8), im_topdown.astype(np.uint8)
99
+ else:
100
+ im_drawn_rgb, im_topdown = im.astype(np.uint8), None
101
+ return im_drawn_rgb, im_topdown
102
+
103
+ def setup(config_file):
104
+ """
105
+ Create configs and perform basic setups.
106
+ """
107
+ cfg = get_cfg()
108
+ get_cfg_defaults(cfg)
109
+
110
+ # store locally if needed
111
+ if config_file.startswith(util.CubeRCNNHandler.PREFIX):
112
+ config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file)
113
+
114
+ cfg.merge_from_file(config_file)
115
+ cfg.freeze()
116
+ return cfg
117
+
118
+ def main(config_file, weights=None):
119
+ cfg = setup(config_file)
120
+ model = build_model(cfg)
121
+
122
+ DetectionCheckpointer(model).resume_or_load(
123
+ weights, resume=True
124
+ )
125
+ return cfg, model
126
+
127
+
128
+ if __name__ == "__main__":
129
+ def load_model_config():
130
+ config_file = "configs/Omni_combined.yaml"
131
+ MODEL_WEIGHTS = "output/weak_cube_r-cnn/model_final.pth"
132
+ cfg, model = main(config_file, MODEL_WEIGHTS)
133
+ return model
134
+
135
+ title = 'Weak Cube R-CNN'
136
+ description = "This showcases the different our model [[`Weak Cube RCNN`](https://arxiv.org/abs/2504.13297). To create Weak Cube RCNN, we modify the framework by replacing its 3D loss functions with ones based solely on 2D annotations. Our methods rely heavily on external, strong generalised deep learning models to infer spatial information in scenes. Experimental results show that all models perform comparably to an annotation time-equalised Cube R-CNN, whereof the pseudo ground truth method achieves the highest accuracy. The results show the methods' ability to understand scenes in 3D, providing satisfactory visual results. Although not precise enough for centimetre accurate measurements, the method provide a solid foundation for further research. \n Check out the code on [GitHub](https://github.com/AndreasLH/Weak-Cube-R-CNN)"
137
+
138
+
139
+ demo = gr.Interface(
140
+ title=title,
141
+ fn=do_test,
142
+ inputs=[
143
+ gr.Image(label="Input Image"),
144
+ gr.Slider(0, 1, value=0.25, label="Threshold", info="Only show predictions with a confidence above this threshold"),
145
+ gr.Textbox(value="Weak Cube R-CNN", visible=False, render=False)
146
+ ],
147
+ outputs=[gr.Image(label="Predictions"), gr.Image(label="Top view")],
148
+ description=description,
149
+ allow_flagging='never',
150
+ examples=[["datasets/examples/ex2.jpg"],[],[],["datasets/examples/ex1.jpg"]],
151
+ )
152
+
153
+
154
+ # demo.launch(server_name="0.0.0.0", server_port=7860)
155
+ demo.launch()
configs/Base.yaml ADDED
@@ -0,0 +1,89 @@
1
+ SOLVER:
2
+ TYPE: "sgd"
3
+ IMS_PER_BATCH: 32
4
+ BASE_LR: 0.02
5
+ STEPS: (19200, 25600)
6
+ MAX_ITER: 32000
7
+ WEIGHT_DECAY: 0.0001
8
+ LR_SCHEDULER_NAME: "WarmupMultiStepLR"
9
+ INPUT:
10
+ MIN_SIZE_TRAIN: (256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640,)
11
+ MIN_SIZE_TEST: 512
12
+ MAX_SIZE_TRAIN: 4096
13
+ MAX_SIZE_TEST: 4096
14
+ TEST:
15
+ VISIBILITY_THRES: 0.33333333
16
+ TRUNCATION_THRES: 0.33333333
17
+ EVAL_PERIOD: 16000
18
+ DATASETS:
19
+ TRAIN: ('KITTI_train', 'KITTI_val')
20
+ TEST: ('KITTI_test',)
21
+ CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck', 'tram', 'person')
22
+ IGNORE_NAMES: "['dontcare', 'ignore', 'void']"
23
+ MIN_HEIGHT_THRES: 0.05
24
+ TRUNCATION_THRES: 0.75
25
+ VISIBILITY_THRES: 0.25
26
+ TRUNC_2D_BOXES: True
27
+ VIS_PERIOD: 640
28
+ DATALOADER:
29
+ SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
30
+ REPEAT_THRESHOLD: 0.1
31
+ MODEL:
32
+ PIXEL_MEAN: [103.530, 116.280, 123.675]
33
+ PIXEL_STD: [57.375, 57.120, 58.395]
34
+ META_ARCHITECTURE: "RCNN3D"
35
+ MASK_ON: False
36
+ STABILIZE: 0.02
37
+ USE_BN: True
38
+ BACKBONE:
39
+ FREEZE_AT: 0
40
+ NAME: 'build_dla_from_vision_fpn_backbone'
41
+ DLA:
42
+ TYPE: 'dla34'
43
+ FPN:
44
+ IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6']
45
+ ANCHOR_GENERATOR:
46
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
47
+ ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
48
+ RPN:
49
+ HEAD_NAME: "StandardRPNHead"
50
+ IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6']
51
+ PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
52
+ PRE_NMS_TOPK_TEST: 1000 # Per FPN level
53
+ POST_NMS_TOPK_TRAIN: 1000
54
+ POST_NMS_TOPK_TEST: 1000
55
+ BOUNDARY_THRESH: -1
56
+ OBJECTNESS_UNCERTAINTY: "IoUness"
57
+ IOU_THRESHOLDS: [0.05, 0.05]
58
+ POSITIVE_FRACTION: 1.0
59
+ PROPOSAL_GENERATOR:
60
+ NAME: "RPNWithIgnore"
61
+ ROI_HEADS:
62
+ NAME: "ROIHeads3D"
63
+ IN_FEATURES: ["p2", "p3", "p4", "p5", 'p6']
64
+ BATCH_SIZE_PER_IMAGE: 512
65
+ SCORE_THRESH_TEST: 0.01
66
+ NUM_CLASSES: 43
67
+ ROI_BOX_HEAD:
68
+ NAME: "FastRCNNConvFCHead"
69
+ NUM_FC: 2
70
+ POOLER_RESOLUTION: 7
71
+ ROI_CUBE_HEAD:
72
+ NAME: 'CubeHead'
73
+ Z_TYPE: 'direct'
74
+ POSE_TYPE: '6d'
75
+ NUM_FC: 2
76
+ SHARED_FC: True
77
+ USE_CONFIDENCE: 1.0
78
+ LOSS_W_3D: 1.0
79
+ POOLER_TYPE: 'ROIAlignV2'
80
+ POOLER_RESOLUTION: 7
81
+ DISENTANGLED_LOSS: True
82
+ ALLOCENTRIC_POSE: True
83
+ VIRTUAL_FOCAL: 512.0
84
+ VIRTUAL_DEPTH: True
85
+ CHAMFER_POSE: True
86
+ TEST: 'blasss'
87
+ DIMS_PRIORS_ENABLED: True
88
+ DIMS_PRIORS_PRECOMPUTED: False
89
+ VERSION: 2
configs/Base_Omni3D.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 2 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248
5
+ BASE_LR: 0.0214 #0.12
6
+ STEPS: (17280, 23040)
7
+ MAX_ITER: 100000 #116000
8
+ WARMUP_ITERS: 0 #3625
9
+ TEST:
10
+ EVAL_PERIOD: 7200 #29000
11
+ VIS_PERIOD: 1 #2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train_mini', 'SUNRGBD_val_mini')
14
+ TEST: ('SUNRGBD_test_mini',)
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
configs/Base_Omni3D_2D_only.yaml ADDED
@@ -0,0 +1,20 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 6 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248
5
+ BASE_LR: 0.0214 #0.12
6
+ STEPS: (30000, 40000)
7
+ MAX_ITER: 50000 #116000
8
+ WARMUP_ITERS: 0 #3625
9
+ TEST:
10
+ EVAL_PERIOD: 25000 #29000
11
+ VIS_PERIOD: 50000 #2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val')
14
+ TEST: ('SUNRGBD_test',)
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
19
+ ROI_CUBE_HEAD:
20
+ LOSS_W_3D: 0.0
configs/Base_Omni3D_in.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 128
5
+ BASE_LR: 0.08
6
+ STEPS: (69600, 92800)
7
+ MAX_ITER: 116000
8
+ WARMUP_ITERS: 3625
9
+ TEST:
10
+ EVAL_PERIOD: 29000
11
+ VIS_PERIOD: 2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val')
14
+ TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test')
15
+ CATEGORY_NAMES: ('stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 38
configs/Base_Omni3D_og.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 192
5
+ BASE_LR: 0.12
6
+ STEPS: (69600, 92800)
7
+ MAX_ITER: 116000
8
+ WARMUP_ITERS: 3625
9
+ TEST:
10
+ EVAL_PERIOD: 29000
11
+ VIS_PERIOD: 2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val')
14
+ TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test')
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
configs/Base_Omni3D_out.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 32
5
+ BASE_LR: 0.02
6
+ STEPS: (69600, 92800)
7
+ MAX_ITER: 116000
8
+ WARMUP_ITERS: 3625
9
+ TEST:
10
+ EVAL_PERIOD: 29000
11
+ VIS_PERIOD: 2320
12
+ DATASETS:
13
+ TRAIN: ('nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val')
14
+ TEST: ('nuScenes_test', 'KITTI_test')
15
+ CATEGORY_NAMES: ('cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 11
configs/Base_Omni3D_prof.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 2 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248
5
+ BASE_LR: 0.001224489796 #0.12
6
+ STEPS: (172, 230)
7
+ MAX_ITER: 288 #116000
8
+ WARMUP_ITERS: 9 #3625
9
+ TEST:
10
+ EVAL_PERIOD: 72 #29000
11
+ VIS_PERIOD: 6 #2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val')
14
+ TEST: ('SUNRGBD_test',)
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
configs/Omni_combined.yaml ADDED
@@ -0,0 +1,37 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 25
5
+ BASE_LR: 0.015
6
+ STEPS: (35000, 40000)
7
+ MAX_ITER: 42001
8
+ WARMUP_ITERS: 0
9
+ CHECKPOINT_PERIOD: 1000
10
+ TEST:
11
+ EVAL_PERIOD: 100000
12
+ VIS_PERIOD: 1000
13
+ DATASETS:
14
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val') #, 'KITTI_train_mini', 'KITTI_val_mini')
15
+ TEST: ('SUNRGBD_test',) # 'KITTI_test_mini')
16
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
17
+ MODEL:
18
+ DEVICE: 'cpu'
19
+ DEPTH_ON: False # whether to use the Depth Anything concatenated features; if disabled, we can use ["p2", "p3", "p4", "p5", "p6"] with sizes [[32], [64], [128], [256], [512]], otherwise only ["p2", "p3", "p4", "p5"] with [[32], [64], [128], [256]]
20
+ FPN:
21
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
22
+ RPN:
23
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
24
+ ANCHOR_GENERATOR:
25
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
26
+ ROI_HEADS:
27
+ NAME: 'ROIHeads3DScore' # name of the class that is the 3d predictor
28
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
29
+ NUM_CLASSES: 50
30
+ POSITIVE_FRACTION: 0.25 # controls the ratio of positive to negative cubes sampled during training
31
+ ROI_CUBE_HEAD:
32
+ NAME: 'CubeHead' # name of the 3d head
33
+ DIMS_PRIORS_ENABLED: True
34
+ POOLER_TYPE: 'ROIAlignV2'
35
+ POOLER_RESOLUTION: 7
36
+ LOSS_W_3D: 1.0
37
+ META_ARCHITECTURE: 'RCNN3D_combined_features' # name of the overall arch that calls the ROI_HEADS.NAME and ROI_CUBE_HEAD.NAME
configs/category_meta.json ADDED
@@ -0,0 +1 @@
1
+ {"thing_classes": ["pedestrian", "car", "cyclist", "van", "truck", "traffic cone", "barrier", "motorcycle", "bicycle", "bus", "trailer", "books", "bottle", "camera", "cereal box", "chair", "cup", "laptop", "shoes", "towel", "blinds", "window", "lamp", "shelves", "mirror", "sink", "cabinet", "bathtub", "door", "toilet", "desk", "box", "bookcase", "picture", "table", "counter", "bed", "night stand", "pillow", "sofa", "television", "floor mat", "curtain", "clothes", "stationery", "refrigerator", "bin", "stove", "oven", "machine"], "thing_dataset_id_to_contiguous_id": {"0": 0, "1": 1, "3": 2, "4": 3, "5": 4, "8": 5, "9": 6, "10": 7, "11": 8, "12": 9, "13": 10, "14": 11, "15": 12, "16": 13, "17": 14, "18": 15, "19": 16, "20": 17, "21": 18, "22": 19, "23": 20, "24": 21, "25": 22, "26": 23, "27": 24, "28": 25, "29": 26, "30": 27, "31": 28, "32": 29, "33": 30, "34": 31, "35": 32, "36": 33, "37": 34, "38": 35, "39": 36, "40": 37, "42": 38, "43": 39, "44": 40, "45": 41, "46": 42, "47": 43, "48": 44, "49": 45, "52": 46, "53": 47, "57": 48, "61": 49}}
configs/cubercnn_DLA34_FPN.yaml ADDED
@@ -0,0 +1,6 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_dla_from_vision_fpn_backbone'
5
+ DLA:
6
+ TYPE: 'dla34'
configs/cubercnn_ResNet34_FPN.yaml ADDED
@@ -0,0 +1,7 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_resnet_from_vision_fpn_backbone'
5
+ RESNETS:
6
+ DEPTH: 34
7
+ TORCHVISION: True
configs/cubercnn_densenet_FPN.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_densenet_fpn_backbone'
configs/cubercnn_mnasnet_FPN.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_mnasnet_fpn_backbone'
configs/cubercnn_shufflenet_FPN.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_shufflenet_fpn_backbone'
cubercnn/config/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .config import *
cubercnn/config/config.py ADDED
@@ -0,0 +1,187 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from detectron2.config import CfgNode as CN
3
+
4
+ def get_cfg_defaults(cfg):
5
+
6
+ # A list of category names which will be used
7
+ cfg.DATASETS.CATEGORY_NAMES = []
8
+
9
+ # The category names which will be treated as ignore
10
+ # e.g., not counting as background during training
11
+ # or as false positives during evaluation.
12
+ cfg.DATASETS.IGNORE_NAMES = []
13
+
14
+ # Should the datasets appear with the same probability
15
+ # in batches (e.g., the imbalance from small and large
16
+ # datasets will be accounted for during sampling)
17
+ cfg.DATALOADER.BALANCE_DATASETS = False
18
+
19
+ # The thresholds for when to treat a known box
20
+ # as ignore based on too heavy of truncation or
21
+ # too low of visibility in the image. This affects
22
+ # both training and evaluation ignores.
23
+ cfg.DATASETS.TRUNCATION_THRES = 0.99
24
+ cfg.DATASETS.VISIBILITY_THRES = 0.01
25
+ cfg.DATASETS.MIN_HEIGHT_THRES = 0.00
26
+ cfg.DATASETS.MAX_DEPTH = 1e8
27
+
28
+ # Whether modal 2D boxes should be loaded,
29
+ # or if the full 3D projected boxes should be used.
30
+ cfg.DATASETS.MODAL_2D_BOXES = False
31
+
32
+ # Whether truncated 2D boxes should be loaded,
33
+ # or if the 3D full projected boxes should be used.
34
+ cfg.DATASETS.TRUNC_2D_BOXES = True
35
+
36
+ # Threshold used for matching and filtering boxes
37
+ # inside of ignore regions, within the RPN and ROIHeads
38
+ cfg.MODEL.RPN.IGNORE_THRESHOLD = 0.5
39
+
40
+ # Configuration for cube head
41
+ cfg.MODEL.ROI_CUBE_HEAD = CN()
42
+ cfg.MODEL.ROI_CUBE_HEAD.NAME = "CubeHead"
43
+ cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION = 7
44
+ cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO = 0
45
+ cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE = "ROIAlignV2"
46
+
47
+ # Settings for the cube head features
48
+ cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV = 0
49
+ cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM = 256
50
+ cfg.MODEL.ROI_CUBE_HEAD.NUM_FC = 2
51
+ cfg.MODEL.ROI_CUBE_HEAD.FC_DIM = 1024
52
+ # proposal method
53
+ cfg.MODEL.ROI_CUBE_HEAD.NUMBER_OF_PROPOSALS = 1000
54
+
55
+ # the style to predict Z with currently supported
56
+ # options --> ['direct', 'sigmoid', 'log', 'clusters']
57
+ cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE = "direct"
58
+
59
+ # the style to predict pose with currently supported
60
+ # options --> ['6d', 'euler', 'quaternion']
61
+ cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE = "6d"
62
+
63
+ # Whether to scale all 3D losses by inverse depth
64
+ cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT = False
65
+
66
+ # Virtual depth puts all predictions of depth into
67
+ # a shared virtual space with a shared focal length.
68
+ cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH = True
69
+ cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL = 512.0
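# Rough sketch of the virtual-depth idea described above (illustrative, not the exact
# Cube R-CNN implementation): depth is rescaled into a shared camera space so that
# supervision is comparable across datasets with different intrinsics, e.g.
#   z_virtual = z * VIRTUAL_FOCAL / effective_focal
# where effective_focal is the image's focal length after accounting for resizing.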
70
+
71
+ # If true, then all losses are computed using the 8 corners
72
+ # such that they are all in a shared scale space.
73
+ # E.g., their scale correlates with their impact on 3D IoU.
74
+ # This way no manual weights need to be set.
75
+ cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS = True
76
+
77
+ # When > 1, the outputs of the 3D head will be based on
78
+ # a 2D scale clustering, based on 2D proposal height/width.
79
+ # This parameter describes the number of bins to cluster.
80
+ cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS = 1
81
+
82
+ # Whether batch norm is enabled during training.
83
+ # If false, all BN weights will be frozen.
84
+ cfg.MODEL.USE_BN = True
85
+
86
+ # Whether to predict the pose in allocentric space.
87
+ # The allocentric space may correlate better with 2D
88
+ # images compared to egocentric poses.
89
+ cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE = True
90
+
91
+ # Whether to use chamfer distance for disentangled losses
92
+ # of pose. This avoids periodic issues of rotation but
93
+ # may prevent the pose "direction" from being interpretable.
94
+ cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE = True
95
+
96
+ # Should the prediction heads share FC features or not.
97
+ # These include groups of uv, z, whl, pose.
98
+ cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC = True
99
+
100
+ # Check for stable gradients. When inf is detected, skip the update.
101
+ # This prevents an occasional bad sample from exploding the model.
102
+ # The threshold below is the allowed percentage of bad samples.
103
+ # 0.0 is off, and 0.01 is recommended for minor robustness to exploding.
104
+ cfg.MODEL.STABILIZE = 0.01
105
+
106
+ # Whether or not to use the dimension priors
107
+ cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED = True
108
+
109
+ # How should prior dimensions be computed?
110
+ # The supported modes are ["exp", "sigmoid"]
111
+ # where exp is unbounded and sigmoid is bounded
112
+ # between +- 3 standard deviations from the mean.
113
+ cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC = 'exp'
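# Illustrative reading of the two modes above (formulas are a sketch, not the exact
# implementation): with a per-category prior (mu, sigma) and a network output d,
#   'exp'     -> dims = mu * exp(d)                        # unbounded
#   'sigmoid' -> dims = mu + sigma * (6 * sigmoid(d) - 3)  # bounded to mu +/- 3*sigma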
114
+
115
+ # weight for confidence loss. 0 is off.
116
+ cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE = 1.0
117
+
118
+ # Loss weights for XY, Z, Dims, Pose
119
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D = 1.0
120
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY = 1.0
121
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE = 7.0
122
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_NORMAL_VEC = 20.0
123
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_IOU = 1.0
124
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_SEG = 2.5
125
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z = 1.0
126
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS = 20.0
127
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DEPTH = 1.0
128
+
129
+ cfg.MODEL.DLA = CN()
130
+
131
+ # Supported types for DLA backbones are...
132
+ # dla34, dla46_c, dla46x_c, dla60x_c, dla60, dla60x, dla102x, dla102x2, dla169
133
+ cfg.MODEL.DLA.TYPE = 'dla34'
134
+
135
+ # Only available for dla34, dla60, dla102
136
+ cfg.MODEL.DLA.TRICKS = False
137
+
138
+ # A joint loss for the disentangled loss.
139
+ # All predictions are computed using a corner
140
+ # or chamfer loss depending on chamfer_pose!
141
+ # Recommended to keep this weight small: [0.05, 0.5]
142
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT = 1.0
143
+
144
+ # sgd, adam, adam+amsgrad, adamw, adamw+amsgrad
145
+ cfg.SOLVER.TYPE = 'sgd'
146
+
147
+ cfg.MODEL.RESNETS.TORCHVISION = True
148
+ cfg.TEST.DETECTIONS_PER_IMAGE = 100
149
+
150
+ cfg.TEST.VISIBILITY_THRES = 1/2.0
151
+ cfg.TEST.TRUNCATION_THRES = 1/2.0
152
+
153
+ cfg.INPUT.RANDOM_FLIP = "horizontal"
154
+
155
+ # When True, we will use localization uncertainty
156
+ # as the new IoUness score in the RPN.
157
+ cfg.MODEL.RPN.OBJECTNESS_UNCERTAINTY = 'IoUness'
158
+
159
+ # If > 0.0 this is the scaling factor that will be applied to
160
+ # an RoI 2D box before doing any pooling to give more context.
161
+ # Ex. 1.5 makes width and height 50% larger.
162
+ cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES = 0.0
163
+
164
+ # weight path specifically for pretraining (no checkpointables will be loaded)
165
+ cfg.MODEL.WEIGHTS_PRETRAIN = ''
166
+
167
+ # ## start of our additions
168
+ cfg.MODEL.ROI_CUBE_HEAD.TEST = 'bas'
169
+ cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_PRECOMPUTED = False
170
+
171
+ cfg.PLOT = CN(new_allowed=True)
172
+ cfg.PLOT.OUTPUT_DIR = ''
173
+ cfg.PLOT.EVAL = ''
174
+ cfg.PLOT.MODE2D = '' # either 'GT' or 'PRED'
175
+
176
+ cfg.PLOT.SCORING_FUNC = None
177
+ cfg.PLOT.PROPOSAL_FUNC = None
178
+ cfg.PLOT.number_of_proposals = 1000
179
+
180
+ cfg.TRAIN = CN(new_allowed=True)
181
+ cfg.TRAIN.pseudo_gt = 'learn'
182
+
183
+ # these are meant to be overridden via command-line arguments
184
+ cfg.log = True
185
+ # (these 2 are mutually exclusive) z_pseudo_gt_patch or z_pseudo_gt_center
186
+ cfg.loss_functions = ['iou']
187
+ cfg.MODEL.DEPTH_ON = False # whether to use the concatenated Depth Anything features
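For context (not part of the diff), a minimal sketch of how a few of these new options could be attached to a fresh detectron2 config and overridden from the command line; the helper name add_cubercnn_defaults is hypothetical and only mirrors a small subset of the defaults registered above.

from detectron2.config import CfgNode as CN, get_cfg

def add_cubercnn_defaults(cfg):
    # hypothetical helper: registers a subset of the options shown above
    cfg.MODEL.ROI_CUBE_HEAD = CN()
    cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE = "direct"      # one of ['direct', 'sigmoid', 'log', 'clusters']
    cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE = "6d"       # one of ['6d', 'euler', 'quaternion']
    cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH = True
    cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL = 512.0
    return cfg

cfg = add_cubercnn_defaults(get_cfg())
cfg.merge_from_list(["MODEL.ROI_CUBE_HEAD.Z_TYPE", "log"])  # CLI-style override
print(cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE)  # -> log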
cubercnn/data/Omni_to_kitti.py ADDED
@@ -0,0 +1,197 @@
1
+ import torch
2
+ from detectron2.data.catalog import MetadataCatalog
3
+ from cubercnn import data
4
+ from detectron2.structures import Boxes, BoxMode
5
+ from cubercnn.util.math_util import estimate_truncation, mat2euler, R_to_allocentric
6
+ import os
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
+ def perp_vector(a, b):
11
+ return np.array([b, -a])
12
+
13
+ def rotate_vector(x, y, theta):
14
+ # Calculate the rotated coordinates
15
+ x_rotated = x * np.cos(theta) - y * np.sin(theta)
16
+ y_rotated = x * np.sin(theta) + y * np.cos(theta)
17
+
18
+ return np.array([x_rotated, y_rotated])
19
+
20
+ def calculate_alpha(location, ry):
21
+ '''
22
+ location: x, y, z coordinates
23
+ ry: rotation around the y-axis; negative is counter-clockwise.
24
+
25
+ The positive x-axis points to the right.
26
+ Calculate the angle between a line perpendicular to the camera-to-object ray and the object's heading (ry).'''
27
+
28
+ # get vector from camera to object
29
+ ry = -ry
30
+ x, y, z = location
31
+ # vector from [0,0,0] to the center of the bounding box
32
+ # we can do the whole thing in 2D, top down view
33
+ # vector perpendicular to center
34
+ perpendicular = perp_vector(x,z)
35
+ # vector corresponding to ry
36
+ ry_vector = np.array([np.cos(ry), np.sin(ry)])
37
+ # angle between perpendicular and ry_vector
38
+ dot = perpendicular[0]*ry_vector[0] + perpendicular[1]*ry_vector[1] # Dot product between [x1, y1] and [x2, y2]
39
+ det = perpendicular[0]*ry_vector[1] - perpendicular[1]*ry_vector[0] # Determinant
40
+ alpha = -np.arctan2(det, dot)
41
+
42
+ # wrap to -pi to pi
43
+ if alpha > np.pi:
44
+ alpha -= 2*np.pi
45
+ if alpha < -np.pi:
46
+ alpha += 2*np.pi
47
+ return alpha
48
+
49
+ def test_calculate_alpha():
50
+ location = [-3.67, 1.67, 6.05]
51
+ ry = -1.24
52
+ expected = -0.72
53
+ result1 = calculate_alpha(location, ry)
54
+
55
+ location = [-9.48, 2.08, 26.41]
56
+ ry = 1.77
57
+ expected = 2.11
58
+ result2 = calculate_alpha(location, ry)
59
+
60
+ location = [4.19, 1.46, 44.41]
61
+ ry = -1.35
62
+ expected = -1.45
63
+ result3 = calculate_alpha(location, ry)
64
+
65
+ location = [-6.41, 2.04, 46.74]
66
+ ry = 1.68
67
+ expected = 1.82
68
+ result4 = calculate_alpha(location, ry)
69
+
70
+ location = [0.28, 2.08, 17.74]
71
+ ry = -1.58
72
+ expected = -1.59
73
+ result5 = calculate_alpha(location, ry)
74
+
75
+ location = [-3.21, 1.97, 11.22]
76
+ ry = -0.13
77
+ expected = 0.15
78
+ result6 = calculate_alpha(location, ry)
79
+
80
+ # assert np.isclose(result, expected, atol=0.01)
81
+ return result1
82
+
83
+
84
+ def main():
85
+ alpha = test_calculate_alpha()
86
+
87
+
88
+ name = 'KITTI'
89
+ split = 'test'
90
+ dataset_paths_to_json = [f'datasets/Omni3D/{name}_{split}.json',]
91
+ os.makedirs('output/KITTI_formatted_predictions', exist_ok=True)
92
+
93
+ # Example 1. load all images
94
+ dataset = data.Omni3D(dataset_paths_to_json)
95
+ imgIds = dataset.getImgIds()
96
+ imgs = dataset.loadImgs(imgIds)
97
+
98
+ # Example 2. load annotations for image index 0
99
+ annIds = dataset.getAnnIds(imgIds=imgs[0]['id'])
100
+ anns = dataset.loadAnns(annIds)
101
+
102
+ data.register_and_store_model_metadata(dataset, 'output')
103
+
104
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
105
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
106
+ cats = {'pedestrian', 'car', 'cyclist', 'van', 'truck'}
107
+
108
+ input_folder = 'kitti_omni_eq'
109
+
110
+ out_path = 'output/'+input_folder+'/KITTI_formatted_predictions/'
111
+ in_path = 'output/'+input_folder+'/KITTI_pred/instances_predictions.pth'
112
+ print('saving to', out_path)
113
+ data_json = torch.load(in_path)
114
+ #
115
+ # reference
116
+ # https://github.com/ZrrSkywalker/MonoDETR/blob/c724572bddbc067832a0e0d860a411003f36c2fa/lib/helpers/tester_helper.py#L114
117
+ files = {}
118
+ for image in tqdm(data_json):
119
+ K = image['K']
120
+ K_inv = np.linalg.inv(K)
121
+ width, height = image['width'], image['height']
122
+ image_id = image['image_id']
123
+ l = []
124
+ for pred in image['instances']:
125
+
126
+ category = thing_classes[pred['category_id']]
127
+ if category not in cats:
128
+ continue
129
+ occluded = 0
130
+ # truncation = estimate_truncation(K, torch.tensor([x3d, y3d, z3d, w3d, h3d, l3d]), pred['pose'], width, height)
131
+ truncation = 0.0 # it does not matter
132
+ rotation_y = mat2euler(np.array(pred['pose']))[1]
133
+ bbox = BoxMode.convert(pred['bbox'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) # convert XYWH to XYXY (left, top, right, bottom)
134
+ h3d, w3d, l3d = pred['dimensions']
135
+ # unproject, this should yield the same
136
+ # cen_2d = np.array(pred['center_2D'] + [1])
137
+ # z3d = pred['center_cam'][2]
138
+ # x3d, y3d, z3d = (K_inv @ (z3d*cen_2d))
139
+
140
+ x3d, y3d, z3d = pred['center_cam']
141
+
142
+ location = pred['center_cam']
143
+ score = pred['score']
144
+ alpha = calculate_alpha(location, rotation_y)
145
+
146
+ # convert to KITTI format
147
+ li = [category, truncation, occluded, alpha, bbox[0], bbox[1], bbox[2], bbox[3], h3d, w3d, l3d, x3d, y3d, z3d, rotation_y, score]
148
+ l.append(li)
149
+ # sort l by z3d
150
+ l = sorted(l, key=lambda x: x[13])
151
+ files[image_id] = l
152
+
153
+ # 7518 test images
154
+ os.makedirs(out_path, exist_ok=True)
155
+ for img_id, content in files.items():
156
+
157
+ img_id_str = str(img_id).zfill(6)
158
+ with open(out_path+f'{img_id_str}.txt', 'w') as f:
159
+ str_i = ''
160
+ for i in content:
161
+ # t = f'{category} {truncation:.2f} {occluded} {alpha:.2f} {bbox[0]:.2f} {bbox[1]:.2f} {bbox[2]:.2f} {bbox[3]:.2f} {w3d:.2f} {h3d:.2f} {l3d:.2f} {x3d:.2f} {y3d:.2f} {z3d:.2f} {rotation_y:.2f} {score:.2f}\n'
162
+ t = f'{i[0][0].upper() + i[0][1:]} {i[1]:.2f} {i[2]} {i[3]:.2f} {i[4]:.2f} {i[5]:.2f} {i[6]:.2f} {i[7]:.2f} {i[8]:.2f} {i[9]:.2f} {i[10]:.2f} {i[11]:.2f} {i[12]:.2f} {i[13]:.2f} {i[14]:.2f} {i[15]:.2f}\n'
163
+ str_i += t
164
+ f.write(str_i)
165
+
166
+ if __name__ == '__main__':
167
+ main()
168
+
169
+ # write to file
170
+ # #Values Name Description
171
+ # ----------------------------------------------------------------------------
172
+ # 1 type Describes the type of object: 'Car', 'Van', 'Truck',
173
+ # 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
174
+ # 'Misc' or 'DontCare'
175
+ # 1 truncated Float from 0 (non-truncated) to 1 (truncated), where
176
+ # truncated refers to the object leaving image boundaries
177
+ # 1 occluded Integer (0,1,2,3) indicating occlusion state:
178
+ # 0 = fully visible, 1 = partly occluded
179
+ # 2 = largely occluded, 3 = unknown
180
+ # 1 alpha Observation angle of object, ranging [-pi..pi]
181
+ # 4 bbox 2D bounding box of object in the image (0-based index):
182
+ # contains left, top, right, bottom pixel coordinates
183
+ # 3 dimensions 3D object dimensions: height, width, length (in meters)
184
+ # 3 location 3D object location x,y,z in camera coordinates (in meters)
185
+ # 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi]
186
+ # 1 score Only for results: Float, indicating confidence in
187
+ # detection, needuhued for p/r curves, higher is better.
188
+
189
+ # output to files 000000.txt 000001.txt ...
190
+
191
+ # example file
192
+ # Car 0.00 0 -1.56 564.62 174.59 616.43 224.74 1.61 1.66 3.20 -0.69 1.69 25.01 -1.59
193
+ # Car 0.00 0 1.71 481.59 180.09 512.55 202.42 1.40 1.51 3.70 -7.43 1.88 47.55 1.55
194
+ # Car 0.00 0 1.64 542.05 175.55 565.27 193.79 1.46 1.66 4.05 -4.71 1.71 60.52 1.56
195
+ # Cyclist 0.00 0 1.89 330.60 176.09 355.61 213.60 1.72 0.50 1.95 -12.63 1.88 34.09 1.54
196
+ # DontCare -1 -1 -10 753.33 164.32 798.00 186.74 -1 -1 -1 -1000 -1000 -1000 -10
197
+ # DontCare -1 -1 -10 738.50 171.32 753.27 184.42 -1 -1 -1 -1000 -1000 -1000 -10
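As a side note (not part of the commit), the vector construction in calculate_alpha should agree, up to angle wrapping, with the more common closed form for the KITTI observation angle, alpha = rotation_y - arctan2(x, z). A small sanity check under that assumption:

import numpy as np

def alpha_reference(location, rotation_y):
    # closed-form KITTI observation angle, wrapped to [-pi, pi]
    x, _, z = location
    a = rotation_y - np.arctan2(x, z)
    return (a + np.pi) % (2 * np.pi) - np.pi

# the first test case above expects roughly -0.72; both formulations give about -0.70
print(alpha_reference([-3.67, 1.67, 6.05], -1.24))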
cubercnn/data/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .datasets import *
2
+ from .dataset_mapper import *
3
+ from .build import *
4
+ from .builtin import *
5
+ from .Omni_to_kitti import *
cubercnn/data/build.py ADDED
@@ -0,0 +1,260 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import itertools
3
+ import logging
4
+ import numpy as np
5
+ import math
6
+ from collections import defaultdict
7
+ import torch.utils.data
8
+
9
+ from detectron2.config import configurable
10
+ from detectron2.utils.logger import _log_api_usage
11
+
12
+ from detectron2.data.catalog import DatasetCatalog
13
+ from detectron2.data.common import DatasetFromList, MapDataset
14
+ from detectron2.data.dataset_mapper import DatasetMapper
15
+ from detectron2.data.samplers import (
16
+ InferenceSampler,
17
+ RepeatFactorTrainingSampler,
18
+ TrainingSampler
19
+ )
20
+ from detectron2.data.build import (
21
+ build_batch_data_loader,
22
+ trivial_batch_collator
23
+ )
24
+
25
+ def filter_images_with_only_crowd_annotations(dataset_dicts):
26
+ """
27
+ Filter out images with none annotations or only crowd annotations
28
+ (i.e., images without non-crowd annotations).
29
+ A common training-time preprocessing on COCO dataset.
30
+
31
+ Args:
32
+ dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
33
+
34
+ Returns:
35
+ list[dict]: the same format, but filtered.
36
+ """
37
+ num_before = len(dataset_dicts)
38
+
39
+ def valid(anns):
40
+ for ann in anns:
41
+ if ann.get("iscrowd", 0) == 0:
42
+ return True
43
+ return False
44
+
45
+ dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
46
+ num_after = len(dataset_dicts)
47
+ logger = logging.getLogger(__name__)
48
+ logger.info(
49
+ "Removed {} images marked with crowd. {} images left.".format(
50
+ num_before - num_after, num_after
51
+ )
52
+ )
53
+ return dataset_dicts
54
+
55
+ def get_detection_dataset_dicts(names, filter_empty=True, **kwargs):
56
+
57
+ if isinstance(names, str):
58
+ names = [names]
59
+
60
+ assert len(names), names
61
+ dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names]
62
+ for dataset_name, dicts in zip(names, dataset_dicts):
63
+ assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
64
+
65
+ dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
66
+
67
+ has_instances = "annotations" in dataset_dicts[0]
68
+
69
+ if filter_empty and has_instances:
70
+ dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
71
+
72
+ assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
73
+ return dataset_dicts
74
+
75
+
76
+ def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None, dataset_id_to_src=None):
77
+ if dataset is None:
78
+ dataset = get_detection_dataset_dicts(
79
+ cfg.DATASETS.TRAIN,
80
+ filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
81
+ min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
82
+ if cfg.MODEL.KEYPOINT_ON
83
+ else 0,
84
+ proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
85
+ )
86
+ _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
87
+
88
+ if mapper is None:
89
+ mapper = DatasetMapper(cfg, True)
90
+
91
+ if sampler is None:
92
+ sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
93
+ balance_datasets = cfg.DATALOADER.BALANCE_DATASETS
94
+ logger = logging.getLogger(__name__)
95
+ logger.info("Using training sampler {}".format(sampler_name))
96
+
97
+ if balance_datasets:
98
+ assert dataset_id_to_src is not None, 'Need dataset sources.'
99
+
100
+ dataset_source_to_int = {val:i for i, val in enumerate(set(dataset_id_to_src.values()))}
101
+ dataset_ids_per_img = [dataset_source_to_int[dataset_id_to_src[img['dataset_id']]] for img in dataset]
102
+ dataset_ids = np.unique(dataset_ids_per_img)
103
+
104
+ # only one source? don't re-weight then.
105
+ if len(dataset_ids) == 1:
106
+ weights_per_img = torch.ones(len(dataset_ids_per_img)).float()
107
+
108
+ # compute per-dataset weights.
109
+ else:
110
+ counts = np.bincount(dataset_ids_per_img)
111
+ counts = [counts[id] for id in dataset_ids]
112
+ weights = [1 - count/np.sum(counts) for count in counts]
113
+ weights = [weight/np.min(weights) for weight in weights]
114
+
115
+ weights_per_img = torch.zeros(len(dataset_ids_per_img)).float()
116
+ dataset_ids_per_img = torch.FloatTensor(dataset_ids_per_img).long()
117
+
118
+ # copy weights
119
+ for dataset_id, weight in zip(dataset_ids, weights):
120
+ weights_per_img[dataset_ids_per_img == dataset_id] = weight
121
+
122
+ # no special sampling whatsoever
123
+ if sampler_name == "TrainingSampler" and not balance_datasets:
124
+ sampler = TrainingSampler(len(dataset))
125
+
126
+ # balance the weight sampling by datasets
127
+ elif sampler_name == "TrainingSampler" and balance_datasets:
128
+ sampler = RepeatFactorTrainingSampler(weights_per_img)
129
+
130
+ # balance the weight sampling by categories
131
+ elif sampler_name == "RepeatFactorTrainingSampler" and not balance_datasets:
132
+ repeat_factors = repeat_factors_from_category_frequency(
133
+ dataset, cfg.DATALOADER.REPEAT_THRESHOLD
134
+ )
135
+ sampler = RepeatFactorTrainingSampler(repeat_factors)
136
+
137
+ # balance the weight sampling by categories AND by dataset frequency
138
+ elif sampler_name == "RepeatFactorTrainingSampler" and balance_datasets:
139
+ repeat_factors = repeat_factors_from_category_frequency(
140
+ dataset, cfg.DATALOADER.REPEAT_THRESHOLD
141
+ )
142
+ repeat_factors *= weights_per_img
143
+ repeat_factors /= repeat_factors.min().item()
144
+ sampler = RepeatFactorTrainingSampler(repeat_factors)
145
+ else:
146
+ raise ValueError("Unknown training sampler: {}".format(sampler_name))
147
+
148
+ return {
149
+ "dataset": dataset,
150
+ "sampler": sampler,
151
+ "mapper": mapper,
152
+ "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
153
+ "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
154
+ "num_workers": cfg.DATALOADER.NUM_WORKERS,
155
+ }
156
+
157
+
158
+ def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh):
159
+ """
160
+ Compute (fractional) per-image repeat factors based on category frequency.
161
+ The repeat factor for an image is a function of the frequency of the rarest
162
+ category labeled in that image. The "frequency of category c" in [0, 1] is defined
163
+ as the fraction of images in the training set (without repeats) in which category c
164
+ appears.
165
+ See :paper:`lvis` (>= v2) Appendix B.2.
166
+
167
+ Args:
168
+ dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
169
+ repeat_thresh (float): frequency threshold below which data is repeated.
170
+ If the frequency is half of `repeat_thresh`, the image will be
171
+ repeated twice.
172
+
173
+ Returns:
174
+ torch.Tensor:
175
+ the i-th element is the repeat factor for the dataset image at index i.
176
+ """
177
+ # 1. For each category c, compute the fraction of images that contain it: f(c)
178
+ category_freq = defaultdict(int)
179
+ for dataset_dict in dataset_dicts: # For each image (without repeats)
180
+ cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
181
+ for cat_id in cat_ids:
182
+ if cat_id < 0: continue
183
+ category_freq[cat_id] += 1
184
+ num_images = len(dataset_dicts)
185
+ for k, v in category_freq.items():
186
+ category_freq[k] = v / num_images
187
+
188
+ # 2. For each category c, compute the category-level repeat factor:
189
+ # r(c) = max(1, sqrt(t / f(c)))
190
+ category_rep = {
191
+ cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
192
+ for cat_id, cat_freq in category_freq.items()
193
+ }
194
+
195
+ # 3. For each image I, compute the image-level repeat factor:
196
+ # r(I) = max_{c in I} r(c)
197
+ rep_factors = []
198
+ for dataset_dict in dataset_dicts:
199
+ cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
200
+ rep_factor = max({category_rep[cat_id] for cat_id in cat_ids if cat_id >= 0}, default=1.0)
201
+ rep_factors.append(rep_factor)
202
+
203
+ return torch.tensor(rep_factors, dtype=torch.float32)
204
+
205
+ @configurable(from_config=_train_loader_from_config)
206
+ def build_detection_train_loader(dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0):
207
+ if isinstance(dataset, list):
208
+ dataset = DatasetFromList(dataset, copy=False)
209
+ if mapper is not None:
210
+ dataset = MapDataset(dataset, mapper)
211
+ if sampler is None:
212
+ sampler = TrainingSampler(len(dataset))
213
+ assert isinstance(sampler, torch.utils.data.Sampler)
214
+ return build_batch_data_loader(
215
+ dataset,
216
+ sampler,
217
+ total_batch_size,
218
+ aspect_ratio_grouping=aspect_ratio_grouping,
219
+ num_workers=num_workers
220
+ )
221
+
222
+ def _test_loader_from_config(cfg, dataset_name, batch_size=1, mapper=None, filter_empty=False):
223
+ if isinstance(dataset_name, str):
224
+ dataset_name = [dataset_name]
225
+
226
+ dataset = get_detection_dataset_dicts(
227
+ dataset_name,
228
+ filter_empty=filter_empty,
229
+ proposal_files=[
230
+ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name
231
+ ]
232
+ if cfg.MODEL.LOAD_PROPOSALS
233
+ else None,
234
+ )
235
+ if mapper is None:
236
+ mapper = DatasetMapper(cfg, False)
237
+
238
+ return {"dataset": dataset, "mapper": mapper, 'batch_size':batch_size, "num_workers": cfg.DATALOADER.NUM_WORKERS}
239
+
240
+ @configurable(from_config=_test_loader_from_config)
241
+ def build_detection_test_loader(dataset, *, mapper, batch_size=1, sampler=None, num_workers=0):
242
+
243
+ if isinstance(dataset, list):
244
+ dataset = DatasetFromList(dataset, copy=False)
245
+ if mapper is not None:
246
+ dataset = MapDataset(dataset, mapper)
247
+ if sampler is None:
248
+ sampler = InferenceSampler(len(dataset))
249
+
250
+ # Always use 1 image per worker during inference since this is the
251
+ # standard when reporting inference time in papers.
252
+ batch_sampler = torch.utils.data.BatchSampler(sampler, batch_size=batch_size, drop_last=False)
253
+ data_loader = torch.utils.data.DataLoader(
254
+ dataset,
255
+ num_workers=num_workers,
256
+ batch_sampler=batch_sampler,
257
+ collate_fn=trivial_batch_collator,
258
+ )
259
+ return data_loader
260
+
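A brief worked example of the repeat-factor rule implemented above (illustration only): with repeat_thresh t = 1.0, a category seen in 25% of images gets r(c) = max(1, sqrt(1.0 / 0.25)) = 2, so images containing it are sampled roughly twice as often, while categories with f(c) >= t keep a factor of 1.

import math

def category_repeat_factor(freq, repeat_thresh=1.0):
    # r(c) = max(1, sqrt(t / f(c))), as in repeat_factors_from_category_frequency above
    return max(1.0, math.sqrt(repeat_thresh / freq))

assert category_repeat_factor(0.25) == 2.0       # rare category: repeated
assert category_repeat_factor(0.5, 0.25) == 1.0  # frequent category: no repeat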
cubercnn/data/builtin.py ADDED
@@ -0,0 +1,46 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+
3
+ def get_omni3d_categories(dataset="omni3d"):
4
+ """
5
+ Returns the Omni3D categories for dataset
6
+ Args:
7
+ dataset: str
8
+ Returns:
9
+ cats: set of strings with category names
10
+ """
11
+
12
+ if dataset == "omni3d":
13
+ cats = set({'chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin'})
14
+ assert len(cats) == 50
15
+ elif dataset == "omni3d_in":
16
+ cats = set({'stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet'})
17
+ assert len(cats) == 38
18
+ elif dataset == "omni3d_out":
19
+ cats = set({'cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle'})
20
+ assert len(cats) == 11
21
+ elif dataset in ["SUNRGBD_train", "SUNRGBD_val", "SUNRGBD_test", "SUNRGBD_train_mini", "SUNRGBD_val_mini", "SUNRGBD_test_mini", "SUNRGBD_test_mini2", "SUNRGBD_test_mini500"]:
22
+ cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'})
23
+ assert len(cats) == 38
24
+ elif dataset in ["Hypersim_train", "Hypersim_val"]:
25
+ cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'})
26
+ assert len(cats) == 29
27
+ elif dataset == "Hypersim_test":
28
+ # Hypersim test annotation does not contain toilet
29
+ cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'})
30
+ assert len(cats) == 28
31
+ elif dataset in ["ARKitScenes_train", "ARKitScenes_val", "ARKitScenes_test"]:
32
+ cats = set({'table', 'bed', 'sofa', 'television', 'refrigerator', 'chair', 'oven', 'machine', 'stove', 'shelves', 'sink', 'cabinet', 'bathtub', 'toilet'})
33
+ assert len(cats) == 14
34
+ elif dataset in ["Objectron_train", "Objectron_val", "Objectron_test"]:
35
+ cats = set({'bicycle', 'books', 'bottle', 'camera', 'cereal box', 'chair', 'cup', 'laptop', 'shoes'})
36
+ assert len(cats) == 9
37
+ elif dataset in ["KITTI_train", "KITTI_val", "KITTI_test"]:
38
+ cats = set({'pedestrian', 'car', 'cyclist', 'van', 'truck'})
39
+ assert len(cats) == 5
40
+ elif dataset in ["nuScenes_train", "nuScenes_val", "nuScenes_test"]:
41
+ cats = set({'pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer'})
42
+ assert len(cats) == 9
43
+ else:
44
+ raise ValueError("%s dataset is not registered." % (dataset))
45
+
46
+ return cats
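A quick usage example for the category helper above; the import path assumes the package layout added in this commit.

from cubercnn.data.builtin import get_omni3d_categories

outdoor_cats = get_omni3d_categories("omni3d_out")
assert "car" in outdoor_cats and len(outdoor_cats) == 11
print(sorted(outdoor_cats))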
cubercnn/data/dataset_mapper.py ADDED
@@ -0,0 +1,272 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import copy
3
+ import logging
4
+ from detectron2.config.config import configurable
5
+ from detectron2.data.transforms.augmentation import AugmentationList
6
+ import torch
7
+ import numpy as np
8
+ from detectron2.structures import BoxMode, Keypoints
9
+ from detectron2.data import detection_utils
10
+ from detectron2.data import transforms as T
11
+ from detectron2.data import (
12
+ DatasetMapper
13
+ )
14
+ from detectron2.structures import (
15
+ Boxes,
16
+ BoxMode,
17
+ Instances,
18
+ )
19
+
20
+ from typing import List, Optional, Union
21
+
22
+ from PIL import Image
23
+
24
+ class DatasetMapper3D(DatasetMapper):
25
+
26
+ @configurable
27
+ def __init__(
28
+ self,
29
+ is_train: bool,
30
+ *,
31
+ augmentations: List[Union[T.Augmentation, T.Transform]],
32
+ image_format: str,
33
+ mode:str=None,
34
+ use_instance_mask: bool = False,
35
+ use_keypoint: bool = False,
36
+ instance_mask_format: str = "polygon",
37
+ keypoint_hflip_indices: Optional[np.ndarray] = None,
38
+ precomputed_proposal_topk: Optional[int] = None,
39
+ recompute_boxes: bool = False,
40
+ only_2d: bool = False,
41
+ ):
42
+ """
43
+ NOTE: this interface is experimental.
44
+
45
+ Args:
46
+ is_train: whether it's used in training or inference
47
+ mode: 'get_depth_maps' (default), 'cube_rcnn'
48
+ augmentations: a list of augmentations or deterministic transforms to apply
49
+ image_format: an image format supported by :func:`detection_utils.read_image`.
50
+ use_instance_mask: whether to process instance segmentation annotations, if available
51
+ use_keypoint: whether to process keypoint annotations if available
52
+ instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
53
+ masks into this format.
54
+ keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
55
+ precomputed_proposal_topk: if given, will load pre-computed
56
+ proposals from dataset_dict and keep the top k proposals for each image.
57
+ recompute_boxes: whether to overwrite bounding box annotations
58
+ by computing tight bounding boxes from instance mask annotations.
59
+ """
60
+ if recompute_boxes:
61
+ assert use_instance_mask, "recompute_boxes requires instance masks"
62
+ # fmt: off
63
+ self.is_train = is_train
64
+ self.augmentations = T.AugmentationList(augmentations)
65
+ self.image_format = image_format
66
+ self.use_instance_mask = use_instance_mask
67
+ self.instance_mask_format = instance_mask_format
68
+ self.use_keypoint = use_keypoint
69
+ self.keypoint_hflip_indices = keypoint_hflip_indices
70
+ self.proposal_topk = precomputed_proposal_topk
71
+ self.recompute_boxes = recompute_boxes
72
+ self.only_2d = only_2d
73
+ self.mode = mode
74
+ # fmt: on
75
+ logger = logging.getLogger(__name__)
76
+ mode_out = "training" if is_train else "inference"
77
+ logger.info(f"[DatasetMapper] Augmentations used in {mode_out}: {augmentations}")
78
+
79
+ @classmethod
80
+ def from_config(cls, cfg, is_train: bool = True, mode='get_depth_maps'):
81
+ augs = detection_utils.build_augmentation(cfg, is_train)
82
+ if cfg.INPUT.CROP.ENABLED and is_train:
83
+ augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
84
+ recompute_boxes = cfg.MODEL.MASK_ON
85
+ else:
86
+ recompute_boxes = False
87
+
88
+ ret = {
89
+ "is_train": is_train,
90
+ "mode": mode,
91
+ "augmentations": augs,
92
+ "image_format": cfg.INPUT.FORMAT,
93
+ "use_instance_mask": cfg.MODEL.MASK_ON,
94
+ "instance_mask_format": cfg.INPUT.MASK_FORMAT,
95
+ "use_keypoint": cfg.MODEL.KEYPOINT_ON,
96
+ "recompute_boxes": recompute_boxes,
97
+ "only_2d": cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D == 0.0,
98
+ }
99
+
100
+ if cfg.MODEL.KEYPOINT_ON:
101
+ ret["keypoint_hflip_indices"] = detection_utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
102
+
103
+ if cfg.MODEL.LOAD_PROPOSALS:
104
+ ret["precomputed_proposal_topk"] = (
105
+ cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
106
+ if is_train
107
+ else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
108
+ )
109
+ return ret
110
+
111
+ def __call__(self, dataset_dict):
112
+
113
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
114
+
115
+ image = detection_utils.read_image(dataset_dict["file_name"], format=self.image_format)
116
+ detection_utils.check_image_size(dataset_dict, image)
117
+
118
+ aug_input = T.AugInput(image)
119
+ # state = torch.get_rng_state()
120
+ transforms = self.augmentations(aug_input)
121
+ image = aug_input.image
122
+ image_shape = image.shape[:2] # h, w
123
+
124
+ # dont load ground map and depth map when
125
+ if not self.only_2d:
126
+ if 'depth_image_path' in dataset_dict:
127
+ dp_img = Image.fromarray(np.load(dataset_dict["depth_image_path"])['depth'])
128
+ dp_img = np.array(dp_img.resize(image.shape[:2][::-1], Image.NEAREST))
129
+ aug_input_dp = T.AugInput(dp_img)
130
+ aug_only_flip = AugmentationList(transforms[-1:])
131
+ # torch.set_rng_state(state)
132
+ #transforms_dp = aug_only_flip(aug_input_dp)
133
+ dp_image = aug_input_dp.image
134
+ dataset_dict["depth_map"] = torch.as_tensor(np.ascontiguousarray(dp_image))
135
+ else:
136
+ dataset_dict["depth_map"] = None
137
+
138
+ # ground image
139
+ if 'ground_image_path' in dataset_dict:
140
+ ground_img = Image.fromarray(np.load(dataset_dict["ground_image_path"])['mask'])
141
+ ground_img = np.array(ground_img.resize(image.shape[:2][::-1], Image.NEAREST))
142
+ aug_input_gr = T.AugInput(ground_img)
143
+ #transforms_gr = aug_only_flip(aug_input_gr)
144
+ gr_image = aug_input_gr.image
145
+ dataset_dict["ground_map"] = torch.as_tensor(np.ascontiguousarray(gr_image))
146
+ else:
147
+ dataset_dict["ground_map"] = None
148
+
149
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
150
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
151
+ # Therefore it's important to use torch.Tensor.
152
+ dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
153
+
154
+ # no need for additional processing at inference
155
+ if not self.is_train:
156
+ return dataset_dict
157
+
158
+ if "annotations" in dataset_dict:
159
+
160
+ dataset_id = dataset_dict['dataset_id']
161
+ K = np.array(dataset_dict['K'])
162
+
163
+ unknown_categories = self.dataset_id_to_unknown_cats[dataset_id]
164
+
165
+ # transform and pop off annotations
166
+ annos = [
167
+ transform_instance_annotations(obj, transforms, K=K)
168
+ for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0
169
+ ]
170
+
171
+ # convert to instance format
172
+ instances = annotations_to_instances(annos, image_shape, unknown_categories)
173
+ dataset_dict["instances"] = detection_utils.filter_empty_instances(instances)
174
+
175
+ return dataset_dict
176
+
177
+ '''
178
+ Cached for mirroring annotations
179
+ '''
180
+ _M1 = np.array([
181
+ [1, 0, 0],
182
+ [0, -1, 0],
183
+ [0, 0, -1]
184
+ ])
185
+ _M2 = np.array([
186
+ [-1., 0., 0.],
187
+ [ 0., -1., 0.],
188
+ [ 0., 0., 1.]
189
+ ])
190
+
191
+
192
+ def transform_instance_annotations(annotation, transforms, *, K):
193
+
194
+ if isinstance(transforms, (tuple, list)):
195
+ transforms = T.TransformList(transforms)
196
+
197
+ # bbox is 1d (per-instance bounding box)
198
+ bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
199
+ bbox = transforms.apply_box(np.array([bbox]))[0]
200
+
201
+ annotation["bbox"] = bbox
202
+ annotation["bbox_mode"] = BoxMode.XYXY_ABS
203
+
204
+ if annotation['center_cam'][2] != 0:
205
+
206
+ # project the 3D box annotation XYZ_3D to screen
207
+ point3D = annotation['center_cam']
208
+ point2D = K @ np.array(point3D)
209
+ point2D[:2] = point2D[:2] / point2D[-1]
210
+ annotation["center_cam_proj"] = point2D.tolist()
211
+
212
+ # apply coords transforms to 2D box
213
+ annotation["center_cam_proj"][0:2] = transforms.apply_coords(
214
+ point2D[np.newaxis][:, :2]
215
+ )[0].tolist()
216
+
217
+ keypoints = (K @ np.array(annotation["bbox3D_cam"]).T).T
218
+ keypoints[:, 0] /= keypoints[:, -1]
219
+ keypoints[:, 1] /= keypoints[:, -1]
220
+
221
+ if annotation['ignore']:
222
+ # all keypoints marked as not visible
223
+ # 0 - unknown, 1 - not visible, 2 visible
224
+ keypoints[:, 2] = 1
225
+ else:
226
+
227
+ valid_keypoints = keypoints[:, 2] > 0
228
+
229
+ # 0 - unknown, 1 - not visible, 2 visible
230
+ keypoints[:, 2] = 2
231
+ keypoints[valid_keypoints, 2] = 2
232
+
233
+ # in place
234
+ transforms.apply_coords(keypoints[:, :2])
235
+ annotation["keypoints"] = keypoints.tolist()
236
+
237
+ # manually apply mirror for pose
238
+ for transform in transforms:
239
+
240
+ # horrizontal flip?
241
+ if isinstance(transform, T.HFlipTransform):
242
+
243
+ pose = _M1 @ np.array(annotation["pose"]) @ _M2
244
+ annotation["pose"] = pose.tolist()
245
+ annotation["R_cam"] = pose.tolist()
246
+
247
+ return annotation
248
+
249
+
250
+ def annotations_to_instances(annos, image_size, unknown_categories):
251
+
252
+ # init
253
+ target = Instances(image_size)
254
+
255
+ # add classes, 2D boxes, 3D boxes and poses
256
+ target.gt_classes = torch.tensor([int(obj["category_id"]) for obj in annos], dtype=torch.int64)
257
+ target.gt_boxes = Boxes([BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos])
258
+ target.gt_boxes3D = torch.FloatTensor([anno['center_cam_proj'] + anno['dimensions'] + anno['center_cam'] for anno in annos])
259
+ target.gt_poses = torch.FloatTensor([anno['pose'] for anno in annos])
260
+
261
+ n = len(target.gt_classes)
262
+
263
+ # do keypoints?
264
+ target.gt_keypoints = Keypoints(torch.FloatTensor([anno['keypoints'] for anno in annos]))
265
+
266
+ gt_unknown_category_mask = torch.zeros(max(unknown_categories)+1, dtype=bool)
267
+ gt_unknown_category_mask[torch.tensor(list(unknown_categories))] = True
268
+
269
+ # include available category indices as tensor with GTs
270
+ target.gt_unknown_category_mask = gt_unknown_category_mask.unsqueeze(0).repeat([n, 1])
271
+
272
+ return target
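A small illustrative check (not part of the mapper) that the horizontal-flip convention used in transform_instance_annotations, pose -> _M1 @ R @ _M2, keeps the pose a proper rotation, since both mirror matrices are themselves 180-degree rotations:

import numpy as np

_M1 = np.diag([1.0, -1.0, -1.0])
_M2 = np.diag([-1.0, -1.0, 1.0])

# an arbitrary example rotation: 90 degrees about the z-axis
R = np.array([[0.0, -1.0, 0.0],
              [1.0,  0.0, 0.0],
              [0.0,  0.0, 1.0]])
R_flipped = _M1 @ R @ _M2

assert np.allclose(R_flipped @ R_flipped.T, np.eye(3))  # still orthogonal
assert np.isclose(np.linalg.det(R_flipped), 1.0)        # determinant stays +1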
cubercnn/data/datasets.py ADDED
@@ -0,0 +1,480 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import json
3
+ import time
4
+ import os
5
+ import contextlib
6
+ import io
7
+ import logging
8
+ import numpy as np
9
+ import pandas as pd
10
+ from pycocotools.coco import COCO
11
+ from collections import defaultdict
12
+ from fvcore.common.timer import Timer
13
+ from detectron2.utils.file_io import PathManager
14
+ from detectron2.structures import BoxMode
15
+ from detectron2.data import MetadataCatalog, DatasetCatalog
16
+
17
+ from cubercnn import util
18
+
19
+ VERSION = '0.1'
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ def get_version():
24
+ return VERSION
25
+
26
+ def get_global_dataset_stats(path_to_stats=None, reset=False):
27
+
28
+ if path_to_stats is None:
29
+ path_to_stats = os.path.join('datasets', 'Omni3D', 'stats.json')
30
+
31
+ if os.path.exists(path_to_stats) and not reset:
32
+ stats = util.load_json(path_to_stats)
33
+
34
+ else:
35
+ stats = {
36
+ 'n_datasets': 0,
37
+ 'n_ims': 0,
38
+ 'n_anns': 0,
39
+ 'categories': []
40
+ }
41
+
42
+ return stats
43
+
44
+
45
+ def save_global_dataset_stats(stats, path_to_stats=None):
46
+
47
+ if path_to_stats is None:
48
+ path_to_stats = os.path.join('datasets', 'Omni3D', 'stats.json')
49
+
50
+ util.save_json(path_to_stats, stats)
51
+
52
+
53
+ def get_filter_settings_from_cfg(cfg=None):
54
+
55
+ if cfg is None:
56
+ return {
57
+ 'category_names': [],
58
+ 'ignore_names': [],
59
+ 'truncation_thres': 0.99,
60
+ 'visibility_thres': 0.01,
61
+ 'min_height_thres': 0.00,
62
+ 'max_height_thres': 1.50,
63
+ 'modal_2D_boxes': False,
64
+ 'trunc_2D_boxes': False,
65
+ 'max_depth': 1e8,
66
+ }
67
+ else:
68
+ return {
69
+ 'category_names': cfg.DATASETS.CATEGORY_NAMES,
70
+ 'ignore_names': cfg.DATASETS.IGNORE_NAMES,
71
+ 'truncation_thres': cfg.DATASETS.TRUNCATION_THRES,
72
+ 'visibility_thres': cfg.DATASETS.VISIBILITY_THRES,
73
+ 'min_height_thres': cfg.DATASETS.MIN_HEIGHT_THRES,
74
+ 'modal_2D_boxes': cfg.DATASETS.MODAL_2D_BOXES,
75
+ 'trunc_2D_boxes': cfg.DATASETS.TRUNC_2D_BOXES,
76
+ 'max_depth': cfg.DATASETS.MAX_DEPTH,
77
+
78
+ # TODO expose as a config
79
+ 'max_height_thres': 1.50,
80
+ }
81
+
82
+
83
+ def is_ignore(anno, filter_settings, image_height):
84
+
85
+ ignore = anno['behind_camera']
86
+ ignore |= (not bool(anno['valid3D']))
87
+
88
+ if ignore:
89
+ return ignore
90
+
91
+ ignore |= anno['dimensions'][0] <= 0.01
92
+ ignore |= anno['dimensions'][1] <= 0.01
93
+ ignore |= anno['dimensions'][2] <= 0.01
94
+ ignore |= anno['center_cam'][2] > filter_settings['max_depth']
95
+ ignore |= (anno['lidar_pts'] == 0)
96
+ ignore |= (anno['segmentation_pts'] == 0)
97
+ ignore |= (anno['depth_error'] > 0.5)
98
+
99
+ # tightly annotated 2D boxes are not always available.
100
+ if filter_settings['modal_2D_boxes'] and 'bbox2D_tight' in anno and anno['bbox2D_tight'][0] != -1:
101
+ bbox2D = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
102
+
103
+ # truncated projected 2D boxes are also not always available.
104
+ elif filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]):
105
+ bbox2D = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
106
+
107
+ # use the projected 3D --> 2D box, which requires a visible 3D cuboid.
108
+ elif 'bbox2D_proj' in anno:
109
+ bbox2D = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
110
+
111
+ else:
112
+ bbox2D = anno['bbox']
113
+
114
+ ignore |= bbox2D[3] <= filter_settings['min_height_thres']*image_height
115
+ ignore |= bbox2D[3] >= filter_settings['max_height_thres']*image_height
116
+
117
+ ignore |= (anno['truncation'] >=0 and anno['truncation'] >= filter_settings['truncation_thres'])
118
+ ignore |= (anno['visibility'] >= 0 and anno['visibility'] <= filter_settings['visibility_thres'])
119
+
120
+ if 'ignore_names' in filter_settings:
121
+ ignore |= anno['category_name'] in filter_settings['ignore_names']
122
+
123
+ return ignore
124
+
125
+
126
+ def simple_register(dataset_name, filter_settings, filter_empty=True, datasets_root_path=None):
127
+
128
+ if datasets_root_path is None:
129
+ datasets_root_path = os.path.join('datasets', 'Omni3D')
130
+
131
+ path_to_json = os.path.join(datasets_root_path, dataset_name + '.json')
132
+ path_to_image_root = 'datasets'
133
+
134
+ DatasetCatalog.register(dataset_name, lambda: load_omni3d_json(
135
+ path_to_json, path_to_image_root,
136
+ dataset_name, filter_settings, filter_empty=filter_empty
137
+ ))
138
+
139
+ MetadataCatalog.get(dataset_name).set(json_file=path_to_json, image_root=path_to_image_root, evaluator_type="coco")
140
+
141
+ class Omni3D(COCO):
142
+ '''
143
+ Class for COCO-like dataset object. Not inherently related to
144
+ use with Detectron2 or training per se.
145
+ '''
146
+
147
+ def __init__(self, annotation_files, filter_settings=None):
148
+
149
+ # load dataset
150
+ self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
151
+ self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
152
+
153
+ self.idx_without_ground = set(pd.read_csv('datasets/no_ground_idx.csv')['img_id'].values)
154
+
155
+ if isinstance(annotation_files, str):
156
+ annotation_files = [annotation_files,]
157
+
158
+ cats_ids_master = []
159
+ cats_master = []
160
+
161
+ for annotation_file in annotation_files:
162
+
163
+ _, name, _ = util.file_parts(annotation_file)
164
+
165
+ logger.info('loading {} annotations into memory...'.format(name))
166
+ dataset = json.load(open(annotation_file, 'r'))
167
+ assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
168
+
169
+ if type(dataset['info']) == list:
170
+ dataset['info'] = dataset['info'][0]
171
+
172
+ dataset['info']['known_category_ids'] = [cat['id'] for cat in dataset['categories']]
173
+
174
+ # first dataset
175
+ if len(self.dataset) == 0:
176
+ self.dataset = dataset
177
+
178
+ # concatenate datasets
179
+ else:
180
+
181
+ if type(self.dataset['info']) == dict:
182
+ self.dataset['info'] = [self.dataset['info']]
183
+
184
+ self.dataset['info'] += [dataset['info']]
185
+ self.dataset['annotations'] += dataset['annotations']
186
+ self.dataset['images'] += dataset['images']
187
+
188
+ # sort through categories
189
+ for cat in dataset['categories']:
190
+
191
+ if not cat['id'] in cats_ids_master:
192
+ cats_ids_master.append(cat['id'])
193
+ cats_master.append(cat)
194
+
195
+ if filter_settings is None:
196
+
197
+ # include every category in the master list
198
+ self.dataset['categories'] = [
199
+ cats_master[i]
200
+ for i in np.argsort(cats_ids_master)
201
+ ]
202
+
203
+ else:
204
+
205
+ # determine which categories we may actually use for filtering.
206
+ trainable_cats = set(filter_settings['ignore_names']) | set(filter_settings['category_names'])
207
+
208
+ # category names are provided to us
209
+ if len(filter_settings['category_names']) > 0:
210
+
211
+ self.dataset['categories'] = [
212
+ cats_master[i]
213
+ for i in np.argsort(cats_ids_master)
214
+ if cats_master[i]['name'] in filter_settings['category_names']
215
+ ]
216
+
217
+ # no categories are provided, so assume use ALL available.
218
+ else:
219
+
220
+ self.dataset['categories'] = [
221
+ cats_master[i]
222
+ for i in np.argsort(cats_ids_master)
223
+ ]
224
+
225
+ filter_settings['category_names'] = [cat['name'] for cat in self.dataset['categories']]
226
+
227
+ trainable_cats = trainable_cats | set(filter_settings['category_names'])
228
+
229
+ valid_anns = []
230
+ im_height_map = {}
231
+
232
+ for im_obj in self.dataset['images']:
233
+ im_height_map[im_obj['id']] = im_obj['height']
234
+
235
+ # Filter out annotations
236
+ for anno_idx, anno in enumerate(self.dataset['annotations']):
237
+
238
+ im_height = im_height_map[anno['image_id']]
239
+
240
+ ignore = is_ignore(anno, filter_settings, im_height)
241
+
242
+ if filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]):
243
+ bbox2D = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
244
+
245
+ elif anno['bbox2D_proj'][0] != -1:
246
+ bbox2D = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
247
+
248
+ elif anno['bbox2D_tight'][0] != -1:
249
+ bbox2D = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
250
+
251
+ else:
252
+ continue
253
+
254
+ width = bbox2D[2]
255
+ height = bbox2D[3]
256
+
257
+ self.dataset['annotations'][anno_idx]['area'] = width*height
258
+ self.dataset['annotations'][anno_idx]['iscrowd'] = False
259
+ self.dataset['annotations'][anno_idx]['ignore'] = ignore
260
+ self.dataset['annotations'][anno_idx]['ignore2D'] = ignore
261
+ self.dataset['annotations'][anno_idx]['ignore3D'] = ignore
262
+
263
+ if filter_settings['modal_2D_boxes'] and anno['bbox2D_tight'][0] != -1:
264
+ self.dataset['annotations'][anno_idx]['bbox'] = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
265
+
266
+ else:
267
+ self.dataset['annotations'][anno_idx]['bbox'] = bbox2D
268
+
269
+ self.dataset['annotations'][anno_idx]['bbox3D'] = anno['bbox3D_cam']
270
+ self.dataset['annotations'][anno_idx]['depth'] = anno['center_cam'][2]
271
+
272
+ category_name = anno["category_name"]
273
+
274
+ # category is part of trainable categories?
275
+ if category_name in trainable_cats:
276
+ if not ignore:
277
+ valid_anns.append(self.dataset['annotations'][anno_idx])
278
+
279
+ self.dataset['annotations'] = valid_anns
280
+
281
+ # append depth image path to each image corresponding to the id
282
+ # for img in self.dataset['images']:
283
+ # img_id = img['id']
284
+ # img['depth_image_path'] = f'datasets/depth_maps/{img_id}.npz'
285
+ # if not img_id in self.idx_without_ground:
286
+ # img['ground_image_path'] = f'datasets/ground_maps/{img_id}.npz'
287
+
288
+ self.createIndex()
289
+
290
+ def info(self):
291
+
292
+ infos = self.dataset['info']
293
+ if type(infos) == dict:
294
+ infos = [infos]
295
+
296
+ for i, info in enumerate(infos):
297
+ print('Dataset {}/{}'.format(i+1, len(infos)))
298
+
299
+ for key, value in info.items():
300
+ print('{}: {}'.format(key, value))
301
+
302
+
303
+ def register_and_store_model_metadata(datasets, output_dir, filter_settings=None):
304
+
305
+ output_file = os.path.join(output_dir, 'category_meta.json')
306
+
307
+ if os.path.exists(output_file):
308
+ metadata = util.load_json(output_file)
309
+ thing_classes = metadata['thing_classes']
310
+ id_map = metadata['thing_dataset_id_to_contiguous_id']
311
+
312
+ # json saves id map as strings rather than ints
313
+ id_map = {int(idA):idB for idA, idB in id_map.items()}
314
+
315
+ else:
316
+ omni3d_stats = util.load_json(os.path.join('datasets', 'Omni3D', 'stats.json'))
317
+ thing_classes = filter_settings['category_names']
318
+
319
+ cat_ids = []
320
+ for cat in thing_classes:
321
+ cat_idx = omni3d_stats['category_names'].index(cat)
322
+ cat_id = omni3d_stats['categories'][cat_idx]['id']
323
+ cat_ids.append(cat_id)
324
+
325
+ cat_order = np.argsort(cat_ids)
326
+ cat_ids = [cat_ids[i] for i in cat_order]
327
+ thing_classes = [thing_classes[i] for i in cat_order]
328
+ id_map = {id: i for i, id in enumerate(cat_ids)}
329
+
330
+ util.save_json(output_file, {
331
+ 'thing_classes': thing_classes,
332
+ 'thing_dataset_id_to_contiguous_id': id_map,
333
+ })
334
+
335
+ MetadataCatalog.get('omni3d_model').thing_classes = thing_classes
336
+ MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id = id_map
337
+
338
+
339
+ def load_omni3d_json(json_file, image_root, dataset_name, filter_settings, filter_empty=True):
340
+
341
+ # read in the dataset
342
+ timer = Timer()
343
+ json_file = PathManager.get_local_path(json_file)
344
+ with contextlib.redirect_stdout(io.StringIO()):
345
+ coco_api = COCO(json_file)
346
+ ground_map_files = os.listdir('datasets/ground_maps')
347
+ ground_idx = []
348
+ for file in ground_map_files:
349
+ try:
350
+ idx = int(file.split('.')[0])
351
+ ground_idx.append(idx)
352
+ except:
353
+ pass
354
+ depth_map_files = os.listdir('datasets/depth_maps')
355
+ depth_idx = []
356
+ for file in depth_map_files:
357
+ try:
358
+ idx = int(file.split('.')[0])
359
+ depth_idx.append(idx)
360
+ except:
361
+ pass
362
+ if timer.seconds() > 1:
363
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
364
+
365
+ # the global meta information for the full dataset
366
+ meta_model = MetadataCatalog.get('omni3d_model')
367
+
368
+ # load the meta information
369
+ meta = MetadataCatalog.get(dataset_name)
370
+ cat_ids = sorted(coco_api.getCatIds(filter_settings['category_names']))
371
+ cats = coco_api.loadCats(cat_ids)
372
+ thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
373
+ meta.thing_classes = thing_classes
374
+
375
+ # the id mapping must be based on the model!
376
+ id_map = meta_model.thing_dataset_id_to_contiguous_id
377
+ meta.thing_dataset_id_to_contiguous_id = id_map
378
+
379
+ # sort indices for reproducible results
380
+ img_ids = sorted(coco_api.imgs.keys())
381
+ imgs = coco_api.loadImgs(img_ids)
382
+ anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
383
+ total_num_valid_anns = sum([len(x) for x in anns])
384
+ total_num_anns = len(coco_api.anns)
385
+ if total_num_valid_anns < total_num_anns:
386
+ logger.info(
387
+ f"{json_file} contains {total_num_anns} annotations, but only "
388
+ f"{total_num_valid_anns} of them match to images in the file."
389
+ )
390
+
391
+ imgs_anns = list(zip(imgs, anns))
392
+ logger.info("Loaded {} images in Omni3D format from {}".format(len(imgs_anns), json_file))
393
+
394
+ dataset_dicts = []
395
+
396
+ # annotation keys to pass along
397
+ ann_keys = [
398
+ "bbox", "bbox3D_cam", "bbox2D_proj", "bbox2D_trunc", "bbox2D_tight",
399
+ "center_cam", "dimensions", "pose", "R_cam", "category_id",
400
+ ]
401
+
402
+ # optional per image keys to pass if exists
403
+ # this property is unique to KITTI.
404
+ img_keys_optional = ['p2']
405
+
406
+ invalid_count = 0
407
+
408
+ for img_dict, anno_dict_list in imgs_anns:
409
+
410
+ has_valid_annotation = False
411
+
412
+ record = {}
413
+ record["file_name"] = os.path.join(image_root, img_dict["file_path"])
414
+ record["dataset_id"] = img_dict["dataset_id"]
415
+ record["height"] = img_dict["height"]
416
+ record["width"] = img_dict["width"]
417
+ record["K"] = img_dict["K"]
418
+
419
+ # store optional keys when available
420
+ for img_key in img_keys_optional:
421
+ if img_key in img_dict:
422
+ record[img_key] = img_dict[img_key]
423
+
424
+ image_id = record["image_id"] = img_dict["id"]
425
+
426
+ if image_id in depth_idx:
427
+ record["depth_image_path"] = f'datasets/depth_maps/{image_id}.npz'
428
+ if image_id in ground_idx:
429
+ record["ground_image_path"] = f'datasets/ground_maps/{image_id}.npz'
430
+ objs = []
431
+ # where invalid annotations are removed
432
+ for anno in anno_dict_list:
433
+ assert anno["image_id"] == image_id
434
+
435
+ obj = {key: anno[key] for key in ann_keys if key in anno}
436
+
437
+ obj["bbox_mode"] = BoxMode.XYWH_ABS
438
+ annotation_category_id = obj["category_id"]
439
+
440
+ # category is not part of ids and is not in the ignore category?
441
+ if not (annotation_category_id in id_map) and not (anno['category_name'] in filter_settings['ignore_names']):
442
+ continue
443
+
444
+ ignore = is_ignore(anno, filter_settings, img_dict["height"])
445
+
446
+ obj['iscrowd'] = False
447
+ obj['ignore'] = ignore
448
+
449
+ if filter_settings['modal_2D_boxes'] and 'bbox2D_tight' in anno and anno['bbox2D_tight'][0] != -1:
450
+ obj['bbox'] = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
451
+
452
+ elif filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]):
453
+ obj['bbox'] = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
454
+
455
+ elif 'bbox2D_proj' in anno:
456
+ obj['bbox'] = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
457
+
458
+ else:
459
+ continue
460
+
461
+ obj['pose'] = anno['R_cam']
462
+
463
+ # store category as -1 for ignores!
464
+ # OLD Logic
465
+ obj["category_id"] = -1 if ignore else id_map[annotation_category_id]
466
+
467
+ objs.append(obj)
468
+
469
+ has_valid_annotation |= (not ignore)
470
+
471
+ if has_valid_annotation or (not filter_empty):
472
+ record["annotations"] = objs
473
+ dataset_dicts.append(record)
474
+
475
+ else:
476
+ invalid_count += 1
477
+
478
+ logger.info("Filtered out {}/{} images without valid annotations".format(invalid_count, len(imgs_anns)))
479
+
480
+ return dataset_dicts
cubercnn/data/filter_ground.py ADDED
@@ -0,0 +1,26 @@
1
+ # Basically a hotfix script to avoid having to run the ground segmentation script again
2
+ # this will filter out empty ground maps and add the indices to the no_ground_idx.csv file
3
+ # It removes ground maps with very little ground, since we assume the segmentation went wrong there
4
+ import os
5
+ import torch
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import pandas as pd
9
+
10
+ files = os.listdir('datasets/ground_maps')
11
+ no_ground = []
12
+ for file in tqdm(files):
13
+ mask = np.load(f'datasets/ground_maps/{file}')['mask']
14
+ ground_map = torch.as_tensor(mask)[::5,::5]
15
+ nnz = torch.count_nonzero(ground_map).item()
16
+ # 100 is determined from looking at the pictures
17
+ if nnz < 100:
18
+ print(nnz)
19
+ print('indices', file[:-4])
20
+ no_ground.append(int(file[:-4]))
21
+ os.remove(f'datasets/ground_maps/{file}')
22
+
23
+ df = pd.DataFrame(no_ground, columns=['img_id'])
24
+ df2 = pd.read_csv('datasets/no_ground_idx.csv')
25
+ df = pd.concat([df, df2])
26
+ df.to_csv('datasets/no_ground_idx.csv', index=False)
cubercnn/data/generate_depth_maps.py ADDED
@@ -0,0 +1,86 @@
1
+ import torch
2
+ import cv2
3
+ # might need to export PYTHONPATH=/work3/$username/3dod/
4
+ from depth.metric_depth.depth_anything_v2.dpt import DepthAnythingV2
5
+ def depth_of_images(encoder='vitl', dataset='hypersim', max_depth=20, device='cpu'):
6
+ """
7
+ Build and return a DepthAnythingV2 metric-depth model that predicts per-pixel depth for an image.
8
+
9
+ encoder = 'vitl' # or 'vits', 'vitb'
10
+ dataset = 'hypersim' # 'hypersim' for indoor model, 'vkitti' for outdoor model
11
+ max_depth = 20 # 20 for indoor model, 80 for outdoor model
12
+ """
13
+ model_configs = {
14
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
15
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
16
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
17
+ }
18
+
19
+ model = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth})
20
+ model.load_state_dict(torch.load(f'depth/checkpoints/depth_anything_v2_metric_{dataset}_{encoder}.pth', map_location=device, weights_only=False))
21
+ model.eval()
22
+ model.to(device)
23
+ return model
24
+
25
+ def init_dataset():
26
+ ''' Dataloader setup.
27
+ This is an alternative to the Omni3D dataset class and the load_omni3d_json function; the training script calls something similar to this.'''
28
+ cfg, filter_settings = get_config_and_filter_settings()
29
+
30
+ dataset_names = ['SUNRGBD_train','SUNRGBD_val','SUNRGBD_test', 'KITTI_train', 'KITTI_val', 'KITTI_test',]
31
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names]
32
+ # for dataset_name in dataset_names:
33
+ # simple_register(dataset_name, filter_settings, filter_empty=True)
34
+
35
+ # Get Image and annotations
36
+ datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
37
+ data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings)
38
+
39
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
40
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
41
+
42
+ infos = datasets.dataset['info']
43
+
44
+ dataset_id_to_unknown_cats = {}
45
+ possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1))
46
+
47
+ dataset_id_to_src = {}
48
+
49
+ for info in infos:
50
+ dataset_id = info['id']
51
+ known_category_training_ids = set()
52
+
53
+ if not dataset_id in dataset_id_to_src:
54
+ dataset_id_to_src[dataset_id] = info['source']
55
+
56
+ for id in info['known_category_ids']:
57
+ if id in dataset_id_to_contiguous_id:
58
+ known_category_training_ids.add(dataset_id_to_contiguous_id[id])
59
+
60
+ # determine and store the unknown categories.
61
+ unknown_categories = possible_categories - known_category_training_ids
62
+ dataset_id_to_unknown_cats[dataset_id] = unknown_categories
63
+
64
+ return datasets
65
+
66
+ if __name__ == '__main__':
67
+ import os
68
+ from detectron2.data.catalog import MetadataCatalog
69
+ import numpy as np
70
+
71
+ from cubercnn import data
72
+ from priors import get_config_and_filter_settings
73
+
74
+ from tqdm import tqdm
75
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
76
+ datasets = init_dataset()
77
+
78
+ os.makedirs('datasets/depth_maps', exist_ok=True)
79
+
80
+ model = depth_of_images(device=device)
81
+
82
+ for img_id, img_info in tqdm(datasets.imgs.items()):
83
+ file_path = img_info['file_path']
84
+ img = cv2.imread('datasets/'+file_path)
85
+ depth = model.infer_image(img) # HxW depth map in meters in numpy
86
+ np.savez_compressed(f'datasets/depth_maps/{img_id}.npz', depth=depth)
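A minimal sketch (hypothetical usage, assuming the script above has already been run) of reading one of the compressed depth maps it writes:

    import numpy as np

    img_id = 123  # hypothetical Omni3D image id
    # each .npz stores an HxW array of metric depth in meters under the key 'depth'
    depth = np.load(f'datasets/depth_maps/{img_id}.npz')['depth']
    print(depth.shape, float(depth.min()), float(depth.max()))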
cubercnn/data/generate_ground_segmentations.py ADDED
@@ -0,0 +1,206 @@
1
+ from segment_anything import sam_model_registry
2
+ from segment_anything.modeling import Sam
3
+ import os
4
+
5
+ def init_segmentation(device='cpu') -> Sam:
6
+ # 1) first cd into the segment_anything and pip install -e .
7
+ # to get the model, stay in the root folder and run download_model.sh
8
+ # 2) chmod +x download_model.sh && ./download_model.sh
9
+ # the largest model: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
10
+ # this is the smallest model
11
+ if os.path.exists('sam-hq/sam_hq_vit_b.pth'):
12
+ sam_checkpoint = "sam-hq/sam_hq_vit_b.pth"
13
+ model_type = "vit_b"
14
+ else:
15
+ sam_checkpoint = "sam-hq/sam_hq_vit_tiny.pth"
16
+ model_type = "vit_tiny"
17
+ print(f'SAM device: {device}, model_type: {model_type}')
18
+ sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
19
+ sam.to(device=device)
20
+ return sam
21
+
22
+
23
+ if __name__ == '__main__':
24
+ from segment_anything.utils.transforms import ResizeLongestSide
25
+ import numpy as np
26
+ import pandas as pd
27
+ import torch
28
+ import torchvision.transforms as T2
29
+ from matplotlib import pyplot as plt
30
+ from PIL import Image
31
+ from tqdm import tqdm
32
+ from torchvision.ops import box_convert
33
+
34
+ import groundingdino.datasets.transforms as T
35
+ from cubercnn import data
36
+ from detectron2.data.catalog import MetadataCatalog
37
+ from groundingdino.util.inference import load_image, load_model, predict
38
+ from priors import get_config_and_filter_settings
39
+ import supervision as sv
40
+
41
+ def init_dataset():
42
+ ''' Dataset setup.
43
+ Currently not used anywhere else, because it is unclear what the difference between the Omni3D dataset class and the load-omni3D-json functions is; this is a third alternative to those. The train script calls something similar to this.'''
44
+ cfg, filter_settings = get_config_and_filter_settings()
45
+
46
+ dataset_names = ['SUNRGBD_train','SUNRGBD_val','SUNRGBD_test', 'KITTI_train', 'KITTI_val', 'KITTI_test',]
47
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names]
48
+ # for dataset_name in dataset_names:
49
+ # simple_register(dataset_name, filter_settings, filter_empty=True)
50
+
51
+ # Get Image and annotations
52
+ datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
53
+ data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings)
54
+
55
+
56
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
57
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
58
+
59
+ infos = datasets.dataset['info']
60
+
61
+ dataset_id_to_unknown_cats = {}
62
+ possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1))
63
+
64
+ dataset_id_to_src = {}
65
+
66
+ for info in infos:
67
+ dataset_id = info['id']
68
+ known_category_training_ids = set()
69
+
70
+ if not dataset_id in dataset_id_to_src:
71
+ dataset_id_to_src[dataset_id] = info['source']
72
+
73
+ for id in info['known_category_ids']:
74
+ if id in dataset_id_to_contiguous_id:
75
+ known_category_training_ids.add(dataset_id_to_contiguous_id[id])
76
+
77
+ # determine and store the unknown categories.
78
+ unknown_categories = possible_categories - known_category_training_ids
79
+ dataset_id_to_unknown_cats[dataset_id] = unknown_categories
80
+
81
+ return datasets
82
+
83
+ def load_image(image_path: str, device) -> tuple[torch.Tensor, torch.Tensor]:
84
+ transform = T.Compose(
85
+ [
86
+ # T.RandomResize([800], max_size=1333),
87
+ T.ToTensor(),
88
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
89
+ ]
90
+ )
91
+ transform2 = T2.ToTensor()
92
+ image_source = Image.open(image_path).convert("RGB")
93
+ image = transform2(image_source).to(device)
94
+ image_transformed, _ = transform(image_source, None)
95
+ return image, image_transformed.to(device)
96
+
97
+
98
+ def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: list[str]) -> np.ndarray:
99
+ """
100
+ This function annotates an image with bounding boxes and labels.
101
+
102
+ Parameters:
103
+ image_source (np.ndarray): The source image to be annotated.
104
+ boxes (torch.Tensor): A tensor containing bounding box coordinates.
105
+ logits (torch.Tensor): A tensor containing confidence scores for each bounding box.
106
+ phrases (List[str]): A list of labels for each bounding box.
107
+
108
+ Returns:
109
+ np.ndarray: The annotated image.
110
+ """
111
+ h, w, _ = image_source.shape
112
+ boxes = boxes * torch.Tensor([w, h, w, h])
113
+ xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
114
+ detections = sv.Detections(xyxy=xyxy)
115
+
116
+ labels = [
117
+ f"{phrase} {logit:.2f}"
118
+ for phrase, logit
119
+ in zip(phrases, logits)
120
+ ]
121
+
122
+ box_annotator = sv.BoxAnnotator()
123
+ # annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
124
+ annotated_frame = image_source.copy()
125
+ annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
126
+ return annotated_frame
127
+
128
+
129
+ datasets = init_dataset()
130
+
131
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
132
+ # model.to(device)
133
+
134
+ segmentor = init_segmentation(device=device)
135
+
136
+ os.makedirs('datasets/ground_maps', exist_ok=True)
137
+ model = load_model("GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", "GroundingDINO/weights/groundingdino_swint_ogc.pth", device=device)
138
+ TEXT_PROMPT = "ground"
139
+ BOX_TRESHOLD = 0.35
140
+ TEXT_TRESHOLD = 0.25
141
+
142
+ noground = 0
143
+ no_ground_idx = []
144
+
145
+ # **** to annotate full dataset ****
146
+ for img_id, img_info in tqdm(datasets.imgs.items()):
147
+ file_path = img_info['file_path']
148
+ w = img_info['width']
149
+ h = img_info['height']
150
+ # **** to annotate full dataset ****
151
+ # **** to annotate demo images ****
152
+ # for img_id in tqdm(os.listdir('datasets/coco_examples')):
153
+ # file_path = 'coco_examples/'+img_id
154
+ image_source, image = load_image('datasets/'+file_path, device=device)
155
+ # **** to annotate demo images ****
156
+
157
+ boxes, logits, phrases = predict(
158
+ model=model,
159
+ image=image,
160
+ caption=TEXT_PROMPT,
161
+ box_threshold=BOX_TRESHOLD,
162
+ text_threshold=TEXT_TRESHOLD,
163
+ device=device
164
+ )
165
+ if len(boxes) == 0:
166
+ print(f"No ground found for {img_id}")
167
+ noground += 1
168
+ # save a ground map that is all zeros
169
+ no_ground_idx.append(img_id)
170
+ continue
171
+ # only want box corresponding to max logit
172
+ max_logit_idx = torch.argmax(logits)
173
+ logit = logits[max_logit_idx].unsqueeze(0)
174
+ box = boxes[max_logit_idx].unsqueeze(0)
175
+ phrase = [phrases[max_logit_idx]]
176
+
177
+ _, h, w = image_source.shape
178
+ box = box * torch.tensor([w, h, w, h], device=device)
179
+ xyxy = box_convert(boxes=box, in_fmt="cxcywh", out_fmt="xyxy")
180
+
181
+ image = image.unsqueeze(0)
182
+ org_shape = image.shape[-2:]
183
+ resize_transform = ResizeLongestSide(segmentor.image_encoder.img_size)
184
+ batched_input = []
185
+ images = resize_transform.apply_image_torch(image*1.0)# .permute(2, 0, 1).contiguous()
186
+ for image, boxes in zip(images, xyxy):
187
+ transformed_boxes = resize_transform.apply_boxes_torch(boxes, org_shape) # Bx4
188
+ batched_input.append({'image': image, 'boxes': transformed_boxes, 'original_size':org_shape})
189
+
190
+ seg_out = segmentor(batched_input, multimask_output=False)
191
+ mask_per_image = seg_out[0]['masks']
192
+
193
+ nnz = torch.count_nonzero(mask_per_image, dim=(-2, -1))
194
+ indices = torch.nonzero(nnz <= 1000).flatten()
195
+ if len(indices) > 0:
196
+ noground += 1
197
+ # save a ground map that is all zeros
198
+ no_ground_idx.append(img_id)
199
+
200
+ np.savez_compressed(f'datasets/ground_maps/{img_id}.npz', mask=mask_per_image.cpu()[0,0,:,:].numpy())
201
+
202
+ print(f"Could not find ground for {noground} images")
203
+
204
+
205
+ df = pd.DataFrame(no_ground_idx, columns=['img_id'])
206
+ df.to_csv('datasets/no_ground_idx.csv', index=False)
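A minimal sketch (hypothetical usage, not part of this commit) of loading one of the ground masks saved above, mirroring the check that filter_ground.py performs:

    import numpy as np
    import torch

    img_id = 123  # hypothetical Omni3D image id
    # each .npz stores an HxW boolean ground mask under the key 'mask'
    mask = np.load(f'datasets/ground_maps/{img_id}.npz')['mask']
    ground_pixels = torch.count_nonzero(torch.as_tensor(mask)).item()
    print(mask.shape, ground_pixels)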
cubercnn/evaluation/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .omni3d_evaluation import *
cubercnn/evaluation/omni3d_evaluation.py ADDED
@@ -0,0 +1,1706 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import contextlib
3
+ import copy
4
+ import datetime
5
+ import io
6
+ import itertools
7
+ import json
8
+ import logging
9
+ import os
10
+ import time
11
+ from collections import defaultdict
12
+ from typing import List, Union
13
+ from typing import Tuple
14
+
15
+ import numpy as np
16
+ import pycocotools.mask as maskUtils
17
+ import torch
18
+ from detectron2.utils.memory import retry_if_cuda_oom
19
+ from detectron2.data import MetadataCatalog, DatasetCatalog
20
+ from detectron2.evaluation.coco_evaluation import COCOEvaluator
21
+ from detectron2.structures import BoxMode
22
+ from detectron2.utils.file_io import PathManager
23
+ from detectron2.utils.logger import create_small_table, log_every_n_seconds
24
+ from pycocotools.cocoeval import COCOeval
25
+ from tabulate import tabulate
26
+ from detectron2.utils.comm import get_world_size, is_main_process
27
+ import detectron2.utils.comm as comm
28
+ from detectron2.evaluation import (
29
+ DatasetEvaluators, inference_context, DatasetEvaluator
30
+ )
31
+ from collections import OrderedDict, abc
32
+ from contextlib import ExitStack, contextmanager
33
+ from torch import nn
34
+
35
+ import logging
36
+ from cubercnn.data import Omni3D
37
+ from pytorch3d import _C
38
+ import torch.nn.functional as F
39
+
40
+ from pytorch3d.ops.iou_box3d import _box_planes, _box_triangles
41
+
42
+ import cubercnn.vis.logperf as utils_logperf
43
+ from cubercnn.data import (
44
+ get_omni3d_categories,
45
+ simple_register
46
+ )
47
+
48
+ """
49
+ This file contains
50
+ * Omni3DEvaluationHelper: a helper object to accumulate and summarize evaluation results
51
+ * Omni3Deval: a wrapper around COCOeval to perform 2D and 3D bounding box evaluation in the detection setting
52
+ * Omni3DEvaluator: a wrapper around COCOEvaluator to collect results on each dataset
53
+ * Omni3DParams: parameters for the evaluation API
54
+ """
55
+
56
+ logger = logging.getLogger(__name__)
57
+
58
+ # Defines the maximum cross product len(dts) * len(gts)
60
+ # which we will attempt to compute on a GPU.
61
+ # The fallback is safer computation on a CPU.
62
+ # A value of 0 disables computation on the GPU entirely.
62
+ MAX_DTS_CROSS_GTS_FOR_IOU3D = 0
63
+
64
+
65
+ def _check_coplanar(boxes: torch.Tensor, eps: float = 1e-4) -> torch.BoolTensor:
66
+ """
67
+ Checks that plane vertices are coplanar.
68
+ Returns a bool tensor of size B, where True indicates a box is coplanar.
69
+ """
70
+ faces = torch.tensor(_box_planes, dtype=torch.int64, device=boxes.device)
71
+ verts = boxes.index_select(index=faces.view(-1), dim=1)
72
+ B = boxes.shape[0]
73
+ P, V = faces.shape
74
+ # (B, P, 4, 3) -> (B, P, 3)
75
+ v0, v1, v2, v3 = verts.reshape(B, P, V, 3).unbind(2)
76
+
77
+ # Compute the normal
78
+ e0 = F.normalize(v1 - v0, dim=-1)
79
+ e1 = F.normalize(v2 - v0, dim=-1)
80
+ normal = F.normalize(torch.cross(e0, e1, dim=-1), dim=-1)
81
+
82
+ # Check the fourth vertex is also on the same plane
83
+ mat1 = (v3 - v0).view(B, 1, -1) # (B, 1, P*3)
84
+ mat2 = normal.view(B, -1, 1) # (B, P*3, 1)
85
+
86
+ return (mat1.bmm(mat2).abs() < eps).view(B)
87
+
88
+
89
+ def _check_nonzero(boxes: torch.Tensor, eps: float = 1e-8) -> torch.BoolTensor:
90
+ """
91
+ Checks that the sides of the box have a non-zero area.
92
+ Returns a bool tensor of size B, where True indicates a box is nonzero.
93
+ """
94
+ faces = torch.tensor(_box_triangles, dtype=torch.int64, device=boxes.device)
95
+ verts = boxes.index_select(index=faces.view(-1), dim=1)
96
+ B = boxes.shape[0]
97
+ T, V = faces.shape
98
+ # (B, T, 3, 3) -> (B, T, 3)
99
+ v0, v1, v2 = verts.reshape(B, T, V, 3).unbind(2)
100
+
101
+ normals = torch.cross(v1 - v0, v2 - v0, dim=-1) # (B, T, 3)
102
+ face_areas = normals.norm(dim=-1) / 2
103
+
104
+ return (face_areas > eps).all(1).view(B)
105
+
106
+ def box3d_overlap(
107
+ boxes_dt: torch.Tensor, boxes_gt: torch.Tensor,
108
+ eps_coplanar: float = 1e-4, eps_nonzero: float = 1e-8
109
+ ) -> torch.Tensor:
110
+ """
111
+ Computes the intersection of 3D boxes_dt and boxes_gt.
112
+
113
+ Inputs boxes_dt, boxes_gt are tensors of shape (B, 8, 3)
114
+ (where B doesn't have to be the same for boxes_dt and boxes_gt),
115
+ containing the 8 corners of the boxes, as follows:
116
+
117
+ (4) +---------+. (5)
118
+ | ` . | ` .
119
+ | (0) +---+-----+ (1)
120
+ | | | |
121
+ (7) +-----+---+. (6)|
122
+ ` . | ` . |
123
+ (3) ` +---------+ (2)
124
+
125
+
126
+ NOTE: Throughout this implementation, we assume that boxes
127
+ are defined by their 8 corners exactly in the order specified in the
128
+ diagram above for the function to give correct results. In addition
129
+ the vertices on each plane must be coplanar.
130
+ As an alternative to the diagram, this is a unit bounding
131
+ box which has the correct vertex ordering:
132
+
133
+ box_corner_vertices = [
134
+ [0, 0, 0],
135
+ [1, 0, 0],
136
+ [1, 1, 0],
137
+ [0, 1, 0],
138
+ [0, 0, 1],
139
+ [1, 0, 1],
140
+ [1, 1, 1],
141
+ [0, 1, 1],
142
+ ]
143
+
144
+ Args:
145
+ boxes_dt: tensor of shape (N, 8, 3) of the coordinates of the 1st boxes
146
+ boxes_gt: tensor of shape (M, 8, 3) of the coordinates of the 2nd boxes
147
+ Returns:
148
+ iou: (N, M) tensor of the intersection over union which is
149
+ defined as: `iou = vol / (vol1 + vol2 - vol)`
150
+ """
151
+ # Make sure predictions are coplanar and nonzero
152
+ invalid_coplanar = ~_check_coplanar(boxes_dt, eps=eps_coplanar)
153
+ invalid_nonzero = ~_check_nonzero(boxes_dt, eps=eps_nonzero)
154
+
155
+ ious = _C.iou_box3d(boxes_dt, boxes_gt)[1]
156
+
157
+ # Offending boxes are set to zero IoU
158
+ if invalid_coplanar.any():
159
+ ious[invalid_coplanar] = 0
160
+ print('Warning: skipping {:d} non-coplanar boxes at eval.'.format(int(invalid_coplanar.float().sum())))
161
+
162
+ if invalid_nonzero.any():
163
+ ious[invalid_nonzero] = 0
164
+ print('Warning: skipping {:d} zero volume boxes at eval.'.format(int(invalid_nonzero.float().sum())))
165
+
166
+ return ious
167
+
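A minimal sketch (hypothetical usage, assuming pytorch3d is installed and this module is importable) of calling box3d_overlap with the corner ordering documented above; shifting a unit cube by half its width should give an IoU of roughly 1/3 (intersection 0.5, union 1.5):

    import torch

    # unit cube corners in the documented vertex ordering (first four at z=0, last four at z=1)
    corners = torch.tensor([
        [0., 0., 0.], [1., 0., 0.], [1., 1., 0.], [0., 1., 0.],
        [0., 0., 1.], [1., 0., 1.], [1., 1., 1.], [0., 1., 1.],
    ])
    boxes_dt = corners.unsqueeze(0)                                    # (1, 8, 3)
    boxes_gt = (corners + torch.tensor([0.5, 0.0, 0.0])).unsqueeze(0)  # shifted copy

    iou = box3d_overlap(boxes_dt, boxes_gt)  # (1, 1) tensor, approximately 0.3333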
168
+ class Omni3DEvaluationHelper:
169
+ def __init__(self,
170
+ dataset_names,
171
+ filter_settings,
172
+ output_folder,
173
+ iter_label='-',
174
+ only_2d=False,
175
+ ):
176
+ """
177
+ A helper class to initialize, evaluate and summarize Omni3D metrics.
178
+
179
+ The evaluator relies on the detectron2 MetadataCatalog for keeping track
180
+ of category names and contiguous IDs. Hence, it is important to set
181
+ these variables appropriately.
182
+
183
+ # (list[str]) the category names in their contiguous order
184
+ MetadataCatalog.get('omni3d_model').thing_classes = ...
185
+
186
+ # (dict[int: int]) the mapping from Omni3D category IDs to the contiguous order
187
+ MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
188
+
189
+ Args:
190
+ dataset_names (list[str]): the individual dataset splits for evaluation
191
+ filter_settings (dict): the filter settings used for evaluation, see
192
+ cubercnn/data/datasets.py get_filter_settings_from_cfg
193
+ output_folder (str): the output folder where results can be stored to disk.
194
+ iter_label (str): an optional iteration/label used within the summary
195
+ only_2d (bool): whether the evaluation mode should be 2D or 2D and 3D.
196
+ """
197
+
198
+ self.dataset_names = dataset_names
199
+ self.filter_settings = filter_settings
200
+ self.output_folder = output_folder
201
+ self.iter_label = iter_label
202
+ self.only_2d = only_2d
203
+
204
+ # Each dataset evaluator is stored here
205
+ self.evaluators = OrderedDict()
206
+
207
+ # These are the main evaluation results
208
+ self.results = OrderedDict()
209
+
210
+ # These store per-dataset results to be printed
211
+ self.results_analysis = OrderedDict()
212
+ self.results_omni3d = OrderedDict()
213
+
214
+ self.overall_imgIds = set()
215
+ self.overall_catIds = set()
216
+
217
+ # These store the evaluations for each category and area,
218
+ # concatenated from ALL evaluated datasets. Doing so avoids
219
+ # the need to re-compute them when accumulating results.
220
+ self.evals_per_cat_area2D = {}
221
+ self.evals_per_cat_area3D = {}
222
+
223
+ self.output_folders = {
224
+ dataset_name: os.path.join(self.output_folder, dataset_name)
225
+ for dataset_name in dataset_names
226
+ }
227
+
228
+ for dataset_name in self.dataset_names:
229
+
230
+ # register any datasets that need it
231
+ if MetadataCatalog.get(dataset_name).get('json_file') is None:
232
+ simple_register(dataset_name, filter_settings, filter_empty=False)
233
+
234
+ # create an individual dataset evaluator
235
+ self.evaluators[dataset_name] = Omni3DEvaluator(
236
+ dataset_name, output_dir=self.output_folders[dataset_name],
237
+ filter_settings=self.filter_settings, only_2d=self.only_2d,
238
+ eval_prox=('Objectron' in dataset_name or 'SUNRGBD' in dataset_name),
239
+ distributed=False, # actual evaluation should be single process
240
+ )
241
+
242
+ self.evaluators[dataset_name].reset()
243
+ self.overall_imgIds.update(set(self.evaluators[dataset_name]._omni_api.getImgIds()))
244
+ self.overall_catIds.update(set(self.evaluators[dataset_name]._omni_api.getCatIds()))
245
+
246
+ def add_predictions(self, dataset_name, predictions):
247
+ """
248
+ Adds predictions to the evaluator for dataset_name. This can be any number of
249
+ predictions, including all predictions passed in at once or in batches.
250
+
251
+ Args:
252
+ dataset_name (str): the dataset split name which the predictions belong to
253
+ predictions (list[dict]): each item in the list is a dict as follows:
254
+
255
+ {
256
+ "image_id": <int> the unique image identifier from Omni3D,
257
+ "K": <np.array> 3x3 intrinsics matrix for the image,
258
+ "width": <int> image width,
259
+ "height": <int> image height,
260
+ "instances": [
261
+ {
262
+ "image_id": <int> the unique image identifier from Omni3D,
263
+ "category_id": <int> the contiguous category prediction IDs,
264
+ which can be mapped from Omni3D's category ID's using
265
+ MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
266
+ "bbox": [float] 2D box as [x1, y1, x2, y2] used for IoU2D,
267
+ "score": <float> the confidence score for the object,
268
+ "depth": <float> the depth of the center of the object,
269
+ "bbox3D": list[list[float]] 8x3 corner vertices used for IoU3D,
270
+ }
271
+ ...
272
+ ]
273
+ }
274
+ """
275
+ # concatenate incoming predictions
276
+ self.evaluators[dataset_name]._predictions += predictions
277
+
278
+ def save_predictions(self, dataset_name):
279
+ """
280
+ Saves the predictions from dataset_name to disk, under the corresponding subfolder of self.output_folder.
281
+
282
+ Args:
283
+ dataset_name (str): the dataset split name which should be saved.
284
+ """
285
+ # save predictions to disk
286
+ output_folder_dataset = self.output_folders[dataset_name]
287
+ PathManager.mkdirs(output_folder_dataset)
288
+ file_path = os.path.join(output_folder_dataset, "instances_predictions.pth")
289
+ with PathManager.open(file_path, "wb") as f:
290
+ torch.save(self.evaluators[dataset_name]._predictions, f)
291
+
292
+ def evaluate(self, dataset_name):
293
+ """
294
+ Runs the evaluation for an individual dataset split, assuming all
295
+ predictions have been passed in.
296
+
297
+ Args:
298
+ dataset_name (str): the dataset split name which should be evaluated.
299
+ """
300
+
301
+ if not dataset_name in self.results:
302
+
303
+ # run evaluation and cache
304
+ self.results[dataset_name] = self.evaluators[dataset_name].evaluate()
305
+
306
+ results = self.results[dataset_name]
307
+
308
+ logger.info('\n'+results['log_str_2D'].replace('mode=2D', '{} iter={} mode=2D'.format(dataset_name, self.iter_label)))
309
+
310
+ # store the partially accumulated evaluations per category per area
311
+ for key, item in results['bbox_2D_evals_per_cat_area'].items():
312
+ if not key in self.evals_per_cat_area2D:
313
+ self.evals_per_cat_area2D[key] = []
314
+ self.evals_per_cat_area2D[key] += item
315
+
316
+ if not self.only_2d:
317
+ # store the partially accumulated evaluations per category per area
318
+ for key, item in results['bbox_3D_evals_per_cat_area'].items():
319
+ if not key in self.evals_per_cat_area3D:
320
+ self.evals_per_cat_area3D[key] = []
321
+ self.evals_per_cat_area3D[key] += item
322
+
323
+ logger.info('\n'+results['log_str_3D'].replace('mode=3D', '{} iter={} mode=3D'.format(dataset_name, self.iter_label)))
324
+
325
+ # full model category names
326
+ category_names = self.filter_settings['category_names']
327
+
328
+ # The set of categories present in the dataset; there should be no duplicates
329
+ categories = {cat for cat in category_names if 'AP-{}'.format(cat) in results['bbox_2D']}
330
+ assert len(categories) == len(set(categories))
331
+
332
+ # default are all NaN
333
+ general_2D, general_3D, omni_2D, omni_3D = (np.nan,) * 4
334
+
335
+ # 2D and 3D performance for categories in dataset; and log
336
+ general_2D = np.mean([results['bbox_2D']['AP-{}'.format(cat)] for cat in categories])
337
+ if not self.only_2d:
338
+ general_3D = np.mean([results['bbox_3D']['AP-{}'.format(cat)] for cat in categories])
339
+
340
+ # 2D and 3D performance on Omni3D categories
341
+ omni3d_dataset_categories = get_omni3d_categories(dataset_name) # dataset-specific categories
342
+ if len(omni3d_dataset_categories - categories) == 0: # omni3d_dataset_categories is a subset of categories
343
+ omni_2D = np.mean([results['bbox_2D']['AP-{}'.format(cat)] for cat in omni3d_dataset_categories])
344
+ if not self.only_2d:
345
+ omni_3D = np.mean([results['bbox_3D']['AP-{}'.format(cat)] for cat in omni3d_dataset_categories])
346
+
347
+ self.results_omni3d[dataset_name] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
348
+
349
+ # Performance analysis
350
+ extras_AP15, extras_AP25, extras_AP50, extras_APn, extras_APm, extras_APf = (np.nan,)*6
351
+ if not self.only_2d:
352
+ extras_AP15 = results['bbox_3D']['AP15']
353
+ extras_AP25 = results['bbox_3D']['AP25']
354
+ extras_AP50 = results['bbox_3D']['AP50']
355
+ extras_APn = results['bbox_3D']['APn']
356
+ extras_APm = results['bbox_3D']['APm']
357
+ extras_APf = results['bbox_3D']['APf']
358
+
359
+ self.results_analysis[dataset_name] = {
360
+ "iters": self.iter_label,
361
+ "AP2D": general_2D, "AP3D": general_3D,
362
+ "AP3D@15": extras_AP15, "AP3D@25": extras_AP25, "AP3D@50": extras_AP50,
363
+ "AP3D-N": extras_APn, "AP3D-M": extras_APm, "AP3D-F": extras_APf
364
+ }
365
+
366
+ # Performance per category
367
+ results_cat = OrderedDict()
368
+ for cat in category_names:
369
+ cat_2D, cat_3D = (np.nan,) * 2
370
+ if 'AP-{}'.format(cat) in results['bbox_2D']:
371
+ cat_2D = results['bbox_2D']['AP-{}'.format(cat)]
372
+ if not self.only_2d:
373
+ cat_3D = results['bbox_3D']['AP-{}'.format(cat)]
374
+ if not np.isnan(cat_2D) or not np.isnan(cat_3D):
375
+ results_cat[cat] = {"AP2D": cat_2D, "AP3D": cat_3D}
376
+ utils_logperf.print_ap_category_histogram(dataset_name, results_cat)
377
+
378
+ def summarize_all(self,):
379
+ '''
380
+ Report collective metrics when possible for the Omni3D dataset.
381
+ This uses pre-computed evaluation results from each dataset,
382
+ which were aggregated and cached while evaluating individually.
383
+ This process simply re-accumulates and summarizes them.
384
+ '''
385
+
386
+ # First, double check that we have all the evaluations
387
+ for dataset_name in self.dataset_names:
388
+ if not dataset_name in self.results:
389
+ self.evaluate(dataset_name)
390
+
391
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
392
+ catId2contiguous = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
393
+ ordered_things = [thing_classes[catId2contiguous[cid]] for cid in self.overall_catIds]
394
+ categories = set(ordered_things)
395
+
396
+ evaluator2D = Omni3Deval(mode='2D')
397
+ evaluator2D.params.catIds = list(self.overall_catIds)
398
+ evaluator2D.params.imgIds = list(self.overall_imgIds)
399
+ evaluator2D.evalImgs = True
400
+ evaluator2D.evals_per_cat_area = self.evals_per_cat_area2D
401
+ evaluator2D._paramsEval = copy.deepcopy(evaluator2D.params)
402
+ evaluator2D.accumulate()
403
+ summarize_str2D = evaluator2D.summarize()
404
+
405
+ precisions = evaluator2D.eval['precision']
406
+
407
+ metrics = ["AP", "AP50", "AP75", "AP95", "APs", "APm", "APl"]
408
+
409
+ results2D = {
410
+ metric: float(
411
+ evaluator2D.stats[idx] * 100 if evaluator2D.stats[idx] >= 0 else "nan"
412
+ )
413
+ for idx, metric in enumerate(metrics)
414
+ }
415
+
416
+ for idx, name in enumerate(ordered_things):
417
+ precision = precisions[:, :, idx, 0, -1]
418
+ precision = precision[precision > -1]
419
+ ap = np.mean(precision) if precision.size else float("nan")
420
+ results2D.update({"AP-" + "{}".format(name): float(ap * 100)})
421
+
422
+ if not self.only_2d:
423
+ evaluator3D = Omni3Deval(mode='3D')
424
+ evaluator3D.params.catIds = list(self.overall_catIds)
425
+ evaluator3D.params.imgIds = list(self.overall_imgIds)
426
+ evaluator3D.evalImgs = True
427
+ evaluator3D.evals_per_cat_area = self.evals_per_cat_area3D
428
+ evaluator3D._paramsEval = copy.deepcopy(evaluator3D.params)
429
+ evaluator3D.accumulate()
430
+ summarize_str3D = evaluator3D.summarize()
431
+
432
+ precisions = evaluator3D.eval['precision']
433
+
434
+ metrics = ["AP", "AP15", "AP25", "AP50", "APn", "APm", "APf"]
435
+
436
+ results3D = {
437
+ metric: float(
438
+ evaluator3D.stats[idx] * 100 if evaluator3D.stats[idx] >= 0 else "nan"
439
+ )
440
+ for idx, metric in enumerate(metrics)
441
+ }
442
+
443
+ for idx, name in enumerate(ordered_things):
444
+ precision = precisions[:, :, idx, 0, -1]
445
+ precision = precision[precision > -1]
446
+ ap = np.mean(precision) if precision.size else float("nan")
447
+ results3D.update({"AP-" + "{}".format(name): float(ap * 100)})
448
+
449
+
450
+ # All concat categories
451
+ general_2D, general_3D = (np.nan,) * 2
452
+
453
+ general_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in categories])
454
+ if not self.only_2d:
455
+ general_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in categories])
456
+
457
+ # Analysis performance
458
+ extras_AP15, extras_AP25, extras_AP50, extras_APn, extras_APm, extras_APf = (np.nan,) * 6
459
+ if not self.only_2d:
460
+ extras_AP15 = results3D['AP15']
461
+ extras_AP25 = results3D['AP25']
462
+ extras_AP50 = results3D['AP50']
463
+ extras_APn = results3D['APn']
464
+ extras_APm = results3D['APm']
465
+ extras_APf = results3D['APf']
466
+
467
+ self.results_analysis["<Concat>"] = {
468
+ "iters": self.iter_label,
469
+ "AP2D": general_2D, "AP3D": general_3D,
470
+ "AP3D@15": extras_AP15, "AP3D@25": extras_AP25, "AP3D@50": extras_AP50,
471
+ "AP3D-N": extras_APn, "AP3D-M": extras_APm, "AP3D-F": extras_APf
472
+ }
473
+
474
+ # Omni3D Outdoor performance
475
+ omni_2D, omni_3D = (np.nan,) * 2
476
+
477
+ omni3d_outdoor_categories = get_omni3d_categories("omni3d_out")
478
+ if len(omni3d_outdoor_categories - categories) == 0:
479
+ omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_outdoor_categories])
480
+ if not self.only_2d:
481
+ omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_outdoor_categories])
482
+
483
+ self.results_omni3d["Omni3D_Out"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
484
+
485
+ # Omni3D Indoor performance
486
+ omni_2D, omni_3D = (np.nan,) * 2
487
+
488
+ omni3d_indoor_categories = get_omni3d_categories("omni3d_in")
489
+ if len(omni3d_indoor_categories - categories) == 0:
490
+ omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_indoor_categories])
491
+ if not self.only_2d:
492
+ omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_indoor_categories])
493
+
494
+ self.results_omni3d["Omni3D_In"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
495
+
496
+ # Omni3D performance
497
+ omni_2D, omni_3D = (np.nan,) * 2
498
+
499
+ omni3d_categories = get_omni3d_categories("omni3d")
500
+ if len(omni3d_categories - categories) == 0:
501
+ omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_categories])
502
+ if not self.only_2d:
503
+ omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_categories])
504
+
505
+ self.results_omni3d["Omni3D"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
506
+
507
+ # Per-category performance for the cumulative datasets
508
+ results_cat = OrderedDict()
509
+ for cat in self.filter_settings['category_names']:
510
+ cat_2D, cat_3D = (np.nan,) * 2
511
+ if 'AP-{}'.format(cat) in results2D:
512
+ cat_2D = results2D['AP-{}'.format(cat)]
513
+ if not self.only_2d:
514
+ cat_3D = results3D['AP-{}'.format(cat)]
515
+ if not np.isnan(cat_2D) or not np.isnan(cat_3D):
516
+ results_cat[cat] = {"AP2D": cat_2D, "AP3D": cat_3D}
517
+
518
+ utils_logperf.print_ap_category_histogram("<Concat>", results_cat)
519
+ utils_logperf.print_ap_analysis_histogram(self.results_analysis)
520
+ utils_logperf.print_ap_omni_histogram(self.results_omni3d)
521
+
522
+
523
+ def inference_on_dataset(model, data_loader):
524
+ """
525
+ Run model on the data_loader.
526
+ Also benchmark the inference speed of `model.__call__` accurately.
527
+ The model will be used in eval mode.
528
+
529
+ Args:
530
+ model (callable): a callable which takes an object from
531
+ `data_loader` and returns some outputs.
532
+
533
+ If it's an nn.Module, it will be temporarily set to `eval` mode.
534
+ If you wish to evaluate a model in `training` mode instead, you can
535
+ wrap the given model and override its behavior of `.eval()` and `.train()`.
536
+ data_loader: an iterable object with a length.
537
+ The elements it generates will be the inputs to the model.
538
+
539
+ Returns:
540
+ inference_json: a list of per-image prediction dicts in COCO-style format, gathered on the main process (non-main processes return an empty list when distributed).
541
+ """
542
+
543
+ num_devices = get_world_size()
544
+ distributed = num_devices > 1
545
+ logger.info("Start inference on {} batches".format(len(data_loader)))
546
+
547
+ total = len(data_loader) # inference data loader must have a fixed length
548
+
549
+ num_warmup = min(5, total - 1)
550
+ start_time = time.perf_counter()
551
+ total_data_time = 0
552
+ total_compute_time = 0
553
+ total_eval_time = 0
554
+
555
+ inference_json = []
556
+
557
+ with ExitStack() as stack:
558
+ if isinstance(model, nn.Module):
559
+ stack.enter_context(inference_context(model))
560
+ stack.enter_context(torch.no_grad())
561
+
562
+ start_data_time = time.perf_counter()
563
+ for idx, inputs in enumerate(data_loader):
564
+ total_data_time += time.perf_counter() - start_data_time
565
+ if idx == num_warmup:
566
+ start_time = time.perf_counter()
567
+ total_data_time = 0
568
+ total_compute_time = 0
569
+ total_eval_time = 0
570
+
571
+ start_compute_time = time.perf_counter()
572
+ outputs = model(inputs)
573
+ if torch.cuda.is_available():
574
+ torch.cuda.synchronize()
575
+ total_compute_time += time.perf_counter() - start_compute_time
576
+
577
+ start_eval_time = time.perf_counter()
578
+
579
+ for input, output in zip(inputs, outputs):
580
+
581
+ prediction = {
582
+ "image_id": input["image_id"],
583
+ "K": input["K"],
584
+ "width": input["width"],
585
+ "height": input["height"],
586
+ }
587
+
588
+ # convert to json format
589
+ instances = output["instances"].to('cpu')
590
+ prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
591
+
592
+ # store in overall predictions
593
+ inference_json.append(prediction)
594
+
595
+ total_eval_time += time.perf_counter() - start_eval_time
596
+
597
+ iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
598
+ data_seconds_per_iter = total_data_time / iters_after_start
599
+ compute_seconds_per_iter = total_compute_time / iters_after_start
600
+ eval_seconds_per_iter = total_eval_time / iters_after_start
601
+ total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
602
+ if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
603
+ eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
604
+ log_every_n_seconds(
605
+ logging.INFO,
606
+ (
607
+ f"Inference done {idx + 1}/{total}. "
608
+ f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
609
+ f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
610
+ f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
611
+ f"Total: {total_seconds_per_iter:.4f} s/iter. "
612
+ f"ETA={eta}"
613
+ ),
614
+ n=5,
615
+ )
616
+ start_data_time = time.perf_counter()
617
+
618
+ # Measure the time only for this worker (before the synchronization barrier)
619
+ total_time = time.perf_counter() - start_time
620
+ total_time_str = str(datetime.timedelta(seconds=total_time))
621
+ # NOTE this format is parsed by grep
622
+ logger.info(
623
+ "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
624
+ total_time_str, total_time / (total - num_warmup), num_devices
625
+ )
626
+ )
627
+ total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
628
+ logger.info(
629
+ "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
630
+ total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
631
+ )
632
+ )
633
+
634
+ if distributed:
635
+ comm.synchronize()
636
+ inference_json = comm.gather(inference_json, dst=0)
637
+ inference_json = list(itertools.chain(*inference_json))
638
+
639
+ if not comm.is_main_process():
640
+ return []
641
+
642
+ return inference_json
643
+
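A minimal sketch (hypothetical wiring, with data-loader construction elided) of how inference_on_dataset and Omni3DEvaluationHelper defined in this file are intended to fit together:

    def run_omni3d_eval(model, data_loaders, dataset_names, filter_settings, output_folder):
        # data_loaders: dict mapping dataset_name -> its test data loader (built elsewhere)
        helper = Omni3DEvaluationHelper(dataset_names, filter_settings, output_folder, only_2d=False)
        for dataset_name in dataset_names:
            predictions = inference_on_dataset(model, data_loaders[dataset_name])
            helper.add_predictions(dataset_name, predictions)
            helper.save_predictions(dataset_name)
            helper.evaluate(dataset_name)
        # finally summarize metrics across all evaluated datasets
        helper.summarize_all()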
644
+ class Omni3DEvaluator(COCOEvaluator):
645
+ def __init__(
646
+ self,
647
+ dataset_name,
648
+ tasks=None,
649
+ distributed=True,
650
+ output_dir=None,
651
+ *,
652
+ max_dets_per_image=None,
653
+ use_fast_impl=False,
654
+ eval_prox=False,
655
+ only_2d=False,
656
+ filter_settings={},
657
+ ):
658
+ """
659
+ Args:
660
+ dataset_name (str): name of the dataset to be evaluated.
661
+ It must have either the following corresponding metadata:
662
+ "json_file": the path to the COCO format annotation
663
+ Or it must be in detectron2's standard dataset format
664
+ so it can be converted to COCO format automatically.
665
+ tasks (tuple[str]): tasks that can be evaluated under the given
666
+ configuration. For now, support only for "bbox".
667
+ distributed (True): if True, will collect results from all ranks and run evaluation
668
+ in the main process.
669
+ Otherwise, will only evaluate the results in the current process.
670
+ output_dir (str): optional, an output directory to dump all
671
+ results predicted on the dataset. The dump contains two files:
672
+ 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
673
+ contains all the results in the format they are produced by the model.
674
+ 2. "coco_instances_results.json" a json file in COCO's result format.
675
+ max_dets_per_image (int): limit on the maximum number of detections per image.
676
+ By default in COCO, this limit is to 100, but this can be customized
677
+ to be greater, as is needed in evaluation metrics AP fixed and AP pool
678
+ (see https://arxiv.org/pdf/2102.01066.pdf)
679
+ This doesn't affect keypoint evaluation.
680
+ use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
681
+ Although the results should be very close to the official implementation in COCO
682
+ API, it is still recommended to compute results with the official API for use in
683
+ papers. The faster implementation also uses more RAM.
684
+ eval_prox (bool): whether to perform proximity evaluation. For datasets that are not
685
+ exhaustively annotated.
686
+ only_2d (bool): evaluates only 2D performance if set to True
687
+ filter_settings (dict): settings for the dataset loader (TBD)
688
+ """
689
+
690
+ self._logger = logging.getLogger(__name__)
691
+ self._distributed = distributed
692
+ self._output_dir = output_dir
693
+ self._use_fast_impl = use_fast_impl
694
+ self._eval_prox = eval_prox
695
+ self._only_2d = only_2d
696
+ self._filter_settings = filter_settings
697
+
698
+ # COCOeval requires the limit on the number of detections per image (maxDets) to be a list
699
+ # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the
700
+ # 3rd element (100) is used as the limit on the number of detections per image when
701
+ # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval,
702
+ # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults.
703
+ if max_dets_per_image is None:
704
+ max_dets_per_image = [1, 10, 100]
705
+
706
+ else:
707
+ max_dets_per_image = [1, 10, max_dets_per_image]
708
+
709
+ self._max_dets_per_image = max_dets_per_image
710
+
711
+ self._tasks = tasks
712
+ self._cpu_device = torch.device("cpu")
713
+
714
+ self._metadata = MetadataCatalog.get(dataset_name)
715
+
716
+ json_file = PathManager.get_local_path(self._metadata.json_file)
717
+ with contextlib.redirect_stdout(io.StringIO()):
718
+ self._omni_api = Omni3D([json_file], filter_settings)
719
+
720
+ # Test set json files do not contain annotations (evaluation must be
721
+ # performed using the COCO evaluation server).
722
+ self._do_evaluation = "annotations" in self._omni_api.dataset
723
+
724
+ def process(self, inputs, outputs):
725
+ """
726
+ Args:
727
+ inputs: the inputs to a model (e.g., GeneralizedRCNN).
728
+ It is a list of dict. Each dict corresponds to an image and
729
+ contains keys like "height", "width", "file_name", "image_id".
730
+ outputs: the outputs of a model. It is a list of dicts with key
731
+ "instances" that contains :class:`Instances`.
732
+ """
733
+
734
+ # Optional image keys to keep when available
735
+ img_keys_optional = ["p2"]
736
+
737
+ for input, output in zip(inputs, outputs):
738
+
739
+ prediction = {
740
+ "image_id": input["image_id"],
741
+ "K": input["K"],
742
+ "width": input["width"],
743
+ "height": input["height"],
744
+ }
745
+
746
+ # store optional keys when available
747
+ for img_key in img_keys_optional:
748
+ if img_key in input:
749
+ prediction.update({img_key: input[img_key]})
750
+
751
+ # already in COCO format
752
+ if type(output["instances"]) == list:
753
+ prediction["instances"] = output["instances"]
754
+
755
+ # tensor instances format
756
+ else:
757
+ instances = output["instances"].to(self._cpu_device)
758
+ prediction["instances"] = instances_to_coco_json(
759
+ instances, input["image_id"]
760
+ )
761
+
762
+ if len(prediction) > 1:
763
+ self._predictions.append(prediction)
764
+
765
+ def _derive_omni_results(self, omni_eval, iou_type, mode, class_names=None):
766
+ """
767
+ Derive the desired score numbers from summarized COCOeval.
768
+ Args:
769
+ omni_eval (None or Omni3Deval): None represents no predictions from model.
770
+ iou_type (str):
771
+ mode (str): either "2D" or "3D"
772
+ class_names (None or list[str]): if provided, will use it to predict
773
+ per-category AP.
774
+ Returns:
775
+ a dict of {metric name: score}
776
+ """
777
+ assert mode in ["2D", "3D"]
778
+
779
+ metrics = {
780
+ "2D": ["AP", "AP50", "AP75", "AP95", "APs", "APm", "APl"],
781
+ "3D": ["AP", "AP15", "AP25", "AP50", "APn", "APm", "APf"],
782
+ }[mode]
783
+
784
+ if iou_type != "bbox":
785
+ raise ValueError("Support only for bbox evaluation.")
786
+
787
+ if omni_eval is None:
788
+ self._logger.warn("No predictions from the model!")
789
+ return {metric: float("nan") for metric in metrics}
790
+
791
+ # the standard metrics
792
+ results = {
793
+ metric: float(
794
+ omni_eval.stats[idx] * 100 if omni_eval.stats[idx] >= 0 else "nan"
795
+ )
796
+ for idx, metric in enumerate(metrics)
797
+ }
798
+ self._logger.info(
799
+ "Evaluation results for {} in {} mode: \n".format(iou_type, mode)
800
+ + create_small_table(results)
801
+ )
802
+ if not np.isfinite(sum(results.values())):
803
+ self._logger.info("Some metrics cannot be computed and are shown as NaN.")
804
+
805
+ if class_names is None or len(class_names) <= 1:
806
+ return results
807
+
808
+ # Compute per-category AP
809
+ # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
810
+ precisions = omni_eval.eval["precision"]
811
+
812
+ # precision has dims (iou, recall, cls, area range, max dets)
813
+ assert len(class_names) == precisions.shape[2]
814
+
815
+ results_per_category = []
816
+ for idx, name in enumerate(class_names):
817
+ # area range index 0: all area ranges
818
+ # max dets index -1: typically 100 per image
819
+ precision = precisions[:, :, idx, 0, -1]
820
+ precision = precision[precision > -1]
821
+ ap = np.mean(precision) if precision.size else float("nan")
822
+ results_per_category.append(("{}".format(name), float(ap * 100)))
823
+
824
+ # tabulate it
825
+ N_COLS = min(6, len(results_per_category) * 2)
826
+ results_flatten = list(itertools.chain(*results_per_category))
827
+ results_table = itertools.zip_longest(
828
+ *[results_flatten[i::N_COLS] for i in range(N_COLS)]
829
+ )
830
+ table = tabulate(
831
+ results_table,
832
+ tablefmt="pipe",
833
+ floatfmt=".3f",
834
+ headers=["category", "AP"] * (N_COLS // 2),
835
+ numalign="left",
836
+ )
837
+ self._logger.info(
838
+ "Per-category {} AP in {} mode: \n".format(iou_type, mode) + table
839
+ )
840
+ results.update({"AP-" + name: ap for name, ap in results_per_category})
841
+ return results
842
+
843
+ def _eval_predictions(self, predictions, img_ids=None):
844
+ """
845
+ Evaluate predictions. Fill self._results with the metrics of the tasks.
846
+ """
847
+ self._logger.info("Preparing results for COCO format ...")
848
+ omni_results = list(itertools.chain(*[x["instances"] for x in predictions]))
849
+ tasks = self._tasks or self._tasks_from_predictions(omni_results)
850
+
851
+ omni3d_global_categories = MetadataCatalog.get('omni3d_model').thing_classes
852
+
853
+ # the dataset results will store only the categories that are present
854
+ # in the corresponding dataset, all others will be dropped.
855
+ dataset_results = []
856
+
857
+ # unmap the category ids for COCO
858
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
859
+ dataset_id_to_contiguous_id = (
860
+ self._metadata.thing_dataset_id_to_contiguous_id
861
+ )
862
+ all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
863
+ num_classes = len(all_contiguous_ids)
864
+ assert (
865
+ min(all_contiguous_ids) == 0
866
+ and max(all_contiguous_ids) == num_classes - 1
867
+ )
868
+
869
+ reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
870
+ for result in omni_results:
871
+ category_id = result["category_id"]
872
+ assert category_id < num_classes, (
873
+ f"A prediction has class={category_id}, "
874
+ f"but the dataset only has {num_classes} classes and "
875
+ f"predicted class id should be in [0, {num_classes - 1}]."
876
+ )
877
+ result["category_id"] = reverse_id_mapping[category_id]
878
+
879
+ cat_name = omni3d_global_categories[category_id]
880
+
881
+ if cat_name in self._metadata.thing_classes:
882
+ dataset_results.append(result)
883
+
884
+ # replace the results with the filtered
885
+ # instances that are in vocabulary.
886
+ omni_results = dataset_results
887
+
888
+ if self._output_dir:
889
+ file_path = os.path.join(self._output_dir, "omni_instances_results.json")
890
+ self._logger.info("Saving results to {}".format(file_path))
891
+ with PathManager.open(file_path, "w") as f:
892
+ f.write(json.dumps(omni_results))
893
+ f.flush()
894
+
895
+ if not self._do_evaluation:
896
+ self._logger.info("Annotations are not available for evaluation.")
897
+ return
898
+
899
+ self._logger.info(
900
+ "Evaluating predictions with {} COCO API...".format(
901
+ "unofficial" if self._use_fast_impl else "official"
902
+ )
903
+ )
904
+ for task in sorted(tasks):
905
+ assert task in {"bbox"}, f"Got unknown task: {task}!"
906
+ evals, log_strs = (
907
+ _evaluate_predictions_on_omni(
908
+ self._omni_api,
909
+ omni_results,
910
+ task,
911
+ img_ids=img_ids,
912
+ only_2d=self._only_2d,
913
+ eval_prox=self._eval_prox,
914
+ )
915
+ if len(omni_results) > 0
916
+ else None # cocoapi does not handle empty results very well
917
+ )
918
+
919
+ modes = evals.keys()
920
+ for mode in modes:
921
+ res = self._derive_omni_results(
922
+ evals[mode],
923
+ task,
924
+ mode,
925
+ class_names=self._metadata.get("thing_classes"),
926
+ )
927
+ self._results[task + "_" + format(mode)] = res
928
+ self._results[task + "_" + format(mode) + '_evalImgs'] = evals[mode].evalImgs
929
+ self._results[task + "_" + format(mode) + '_evals_per_cat_area'] = evals[mode].evals_per_cat_area
930
+
931
+ self._results["log_str_2D"] = log_strs["2D"]
932
+
933
+ if "3D" in log_strs:
934
+ self._results["log_str_3D"] = log_strs["3D"]
935
+
936
+
937
+ def _evaluate_predictions_on_omni(
938
+ omni_gt,
939
+ omni_results,
940
+ iou_type,
941
+ img_ids=None,
942
+ only_2d=False,
943
+ eval_prox=False,
944
+ ):
945
+ """
946
+ Evaluate the coco results using COCOEval API.
947
+ """
948
+ assert len(omni_results) > 0
949
+ log_strs, evals = {}, {}
950
+
951
+ omni_dt = omni_gt.loadRes(omni_results)
952
+
953
+ modes = ["2D"] if only_2d else ["2D", "3D"]
954
+
955
+ for mode in modes:
956
+ omni_eval = Omni3Deval(
957
+ omni_gt, omni_dt, iouType=iou_type, mode=mode, eval_prox=eval_prox
958
+ )
959
+ if img_ids is not None:
960
+ omni_eval.params.imgIds = img_ids
961
+
962
+ omni_eval.evaluate()
963
+ omni_eval.accumulate()
964
+ log_str = omni_eval.summarize()
965
+ log_strs[mode] = log_str
966
+ evals[mode] = omni_eval
967
+
968
+ return evals, log_strs
969
+
970
+
971
+ def instances_to_coco_json(instances, img_id):
972
+
973
+ num_instances = len(instances)
974
+
975
+ if num_instances == 0:
976
+ return []
977
+
978
+ boxes = BoxMode.convert(
979
+ instances.pred_boxes.tensor.numpy(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
980
+ ).tolist()
981
+ scores = instances.scores.tolist()
982
+ classes = instances.pred_classes.tolist()
983
+
984
+ if hasattr(instances, "pred_bbox3D"):
985
+ bbox3D = instances.pred_bbox3D.tolist()
986
+ center_cam = instances.pred_center_cam.tolist()
987
+ center_2D = instances.pred_center_2D.tolist()
988
+ dimensions = instances.pred_dimensions.tolist()
989
+ pose = instances.pred_pose.tolist()
990
+ else:
991
+ # dummy
992
+ bbox3D = np.ones([num_instances, 8, 3]).tolist()
993
+ center_cam = np.ones([num_instances, 3]).tolist()
994
+ center_2D = np.ones([num_instances, 2]).tolist()
995
+ dimensions = np.ones([num_instances, 3]).tolist()
996
+ pose = np.ones([num_instances, 3, 3]).tolist()
997
+
998
+ results = []
999
+ for k in range(num_instances):
1000
+ result = {
1001
+ "image_id": img_id,
1002
+ "category_id": classes[k],
1003
+ "bbox": boxes[k],
1004
+ "score": scores[k],
1005
+ "depth": np.array(bbox3D[k])[:, 2].mean(),
1006
+ "bbox3D": bbox3D[k],
1007
+ "center_cam": center_cam[k],
1008
+ "center_2D": center_2D[k],
1009
+ "dimensions": dimensions[k],
1010
+ "pose": pose[k],
1011
+ }
1012
+
1013
+ results.append(result)
1014
+ return results
1015
+
1016
+
1017
+ # ---------------------------------------------------------------------
1018
+ # Omni3DParams
1019
+ # ---------------------------------------------------------------------
1020
+ class Omni3DParams:
1021
+ """
1022
+ Params for the Omni evaluation API
1023
+ """
1024
+
1025
+ def setDet2DParams(self):
1026
+ self.imgIds = []
1027
+ self.catIds = []
1028
+
1029
+ # np.arange causes trouble: the data points it produces can be slightly larger than the true value
1030
+ self.iouThrs = np.linspace(
1031
+ 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True
1032
+ )
1033
+
1034
+ self.recThrs = np.linspace(
1035
+ 0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True
1036
+ )
1037
+
1038
+ self.maxDets = [1, 10, 100]
1039
+ self.areaRng = [
1040
+ [0 ** 2, 1e5 ** 2],
1041
+ [0 ** 2, 32 ** 2],
1042
+ [32 ** 2, 96 ** 2],
1043
+ [96 ** 2, 1e5 ** 2],
1044
+ ]
1045
+
1046
+ self.areaRngLbl = ["all", "small", "medium", "large"]
1047
+ self.useCats = 1
1048
+
1049
+ def setDet3DParams(self):
1050
+ self.imgIds = []
1051
+ self.catIds = []
1052
+
1053
+ # np.arange causes trouble: the data points it produces can be slightly larger than the true value
1054
+ self.iouThrs = np.linspace(
1055
+ 0.05, 0.5, int(np.round((0.5 - 0.05) / 0.05)) + 1, endpoint=True
1056
+ )
1057
+
1058
+ self.recThrs = np.linspace(
1059
+ 0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True
1060
+ )
1061
+
1062
+ self.maxDets = [1, 10, 100]
1063
+ self.areaRng = [[0, 1e5], [0, 10], [10, 35], [35, 1e5]]
1064
+ self.areaRngLbl = ["all", "near", "medium", "far"]
1065
+ self.useCats = 1
1066
+
1067
+ def __init__(self, mode="2D"):
1068
+ """
1069
+ Args:
1070
+ mode (str): defines 2D or 3D evaluation parameters.
1071
+ One of {"2D", "3D"}
1072
+ """
1073
+
1074
+ if mode == "2D":
1075
+ self.setDet2DParams()
1076
+
1077
+ elif mode == "3D":
1078
+ self.setDet3DParams()
1079
+
1080
+ else:
1081
+ raise Exception("mode %s not supported" % (mode))
1082
+
1083
+ self.iouType = "bbox"
1084
+ self.mode = mode
1085
+ # the proximity threshold defines the neighborhood
1086
+ # when evaluating on non-exhaustively annotated datasets
1087
+ self.proximity_thresh = 0.3
1088
+
1089
+
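A minimal sketch (hypothetical usage) of the thresholds the two parameter modes produce, which is the main difference between 2D and 3D evaluation here:

    params_2d = Omni3DParams(mode="2D")
    print(params_2d.iouThrs)     # 0.50, 0.55, ..., 0.95 (10 IoU thresholds)

    params_3d = Omni3DParams(mode="3D")
    print(params_3d.iouThrs)     # 0.05, 0.10, ..., 0.50 (10 IoU thresholds)
    print(params_3d.areaRngLbl)  # ['all', 'near', 'medium', 'far'] (depth-based ranges)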
1090
+ # ---------------------------------------------------------------------
1091
+ # Omni3Deval
1092
+ # ---------------------------------------------------------------------
1093
+ class Omni3Deval(COCOeval):
1094
+ """
1095
+ Wraps COCOeval for 2D or 3D box evaluation depending on mode
1096
+ """
1097
+
1098
+ def __init__(
1099
+ self, cocoGt=None, cocoDt=None, iouType="bbox", mode="2D", eval_prox=False
1100
+ ):
1101
+ """
1102
+ Initialize COCOeval using coco APIs for Gt and Dt
1103
+ Args:
1104
+ cocoGt: COCO object with ground truth annotations
1105
+ cocoDt: COCO object with detection results
1106
+ iouType: (str) defines the evaluation type. Supports only "bbox" now.
1107
+ mode: (str) defines whether to evaluate 2D or 3D performance.
1108
+ One of {"2D", "3D"}
1109
+ eval_prox: (bool) if True, performs "Proximity Evaluation", i.e.
1110
+ evaluates detections in the proximity of the ground truth 2D boxes.
1111
+ This is used for datasets which are not exhaustively annotated.
1112
+ """
1113
+ if not iouType:
1114
+ print("iouType not specified. use default iouType bbox")
1115
+ elif iouType != "bbox":
1116
+ print("no support for %s iouType" % (iouType))
1117
+ self.mode = mode
1118
+ if mode not in ["2D", "3D"]:
1119
+ raise Exception("mode %s not supported" % (mode))
1120
+ self.eval_prox = eval_prox
1121
+ self.cocoGt = cocoGt # ground truth COCO API
1122
+ self.cocoDt = cocoDt # detections COCO API
1123
+
1124
+ # per-image per-category evaluation results [KxAxI] elements
1125
+ self.evalImgs = defaultdict(list)
1126
+
1127
+ self.eval = {} # accumulated evaluation results
1128
+ self._gts = defaultdict(list) # gt for evaluation
1129
+ self._dts = defaultdict(list) # dt for evaluation
1130
+ self.params = Omni3DParams(mode) # parameters
1131
+ self._paramsEval = {} # parameters for evaluation
1132
+ self.stats = [] # result summarization
1133
+ self.ious = {} # ious between all gts and dts
1134
+
1135
+ if cocoGt is not None:
1136
+ self.params.imgIds = sorted(cocoGt.getImgIds())
1137
+ self.params.catIds = sorted(cocoGt.getCatIds())
1138
+
1139
+ self.evals_per_cat_area = None
1140
+
1141
+ def _prepare(self):
1142
+ """
1143
+ Prepare ._gts and ._dts for evaluation based on params
1144
+ """
1145
+
1146
+ p = self.params
1147
+
1148
+ if p.useCats:
1149
+ gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
1150
+ dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
1151
+
1152
+ else:
1153
+ gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
1154
+ dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
1155
+
1156
+ # set ignore flag
1157
+ ignore_flag = "ignore2D" if self.mode == "2D" else "ignore3D"
1158
+ for gt in gts:
1159
+ gt[ignore_flag] = gt[ignore_flag] if ignore_flag in gt else 0
1160
+
1161
+ self._gts = defaultdict(list) # gt for evaluation
1162
+ self._dts = defaultdict(list) # dt for evaluation
1163
+
1164
+ for gt in gts:
1165
+ self._gts[gt["image_id"], gt["category_id"]].append(gt)
1166
+
1167
+ for dt in dts:
1168
+ self._dts[dt["image_id"], dt["category_id"]].append(dt)
1169
+
1170
+ self.evalImgs = defaultdict(list) # per-image per-category evaluation results
1171
+ self.eval = {} # accumulated evaluation results
1172
+
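_prepare buckets annotations by the (image_id, category_id) pair so that the later per-image, per-category loops are plain dictionary lookups. A toy illustration of that grouping, with made-up ids (not data from this repository):

# Toy illustration of the grouping done in _prepare (ids and scores are made up).
from collections import defaultdict

anns = [
    {"image_id": 1, "category_id": 7, "score": 0.9},
    {"image_id": 1, "category_id": 7, "score": 0.4},
    {"image_id": 2, "category_id": 3, "score": 0.8},
]

by_img_cat = defaultdict(list)
for ann in anns:
    by_img_cat[ann["image_id"], ann["category_id"]].append(ann)

print(len(by_img_cat[1, 7]))  # 2 -> both detections for image 1, category 7
print(by_img_cat[9, 9])       # [] -> missing keys yield an empty bucket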
1173
+ def accumulate(self, p = None):
1174
+ '''
1175
+ Accumulate per image evaluation results and store the result in self.eval
1176
+ :param p: input params for evaluation
1177
+ :return: None
1178
+ '''
1179
+
1180
+ print('Accumulating evaluation results...')
1181
+ assert self.evalImgs, 'Please run evaluate() first'
1182
+
1183
+ tic = time.time()
1184
+
1185
+ # allows input customized parameters
1186
+ if p is None:
1187
+ p = self.params
1188
+
1189
+ p.catIds = p.catIds if p.useCats == 1 else [-1]
1190
+
1191
+ T = len(p.iouThrs)
1192
+ R = len(p.recThrs)
1193
+ K = len(p.catIds) if p.useCats else 1
1194
+ A = len(p.areaRng)
1195
+ M = len(p.maxDets)
1196
+
1197
+ precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
1198
+ recall = -np.ones((T,K,A,M))
1199
+ scores = -np.ones((T,R,K,A,M))
1200
+
1201
+ # create dictionary for future indexing
1202
+ _pe = self._paramsEval
1203
+
1204
+ catIds = _pe.catIds if _pe.useCats else [-1]
1205
+ setK = set(catIds)
1206
+ setA = set(map(tuple, _pe.areaRng))
1207
+ setM = set(_pe.maxDets)
1208
+ setI = set(_pe.imgIds)
1209
+
1210
+ # get inds to evaluate
1211
+ catid_list = [k for n, k in enumerate(p.catIds) if k in setK]
1212
+ k_list = [n for n, k in enumerate(p.catIds) if k in setK]
1213
+ m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
1214
+ a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
1215
+ i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
1216
+
1217
+ I0 = len(_pe.imgIds)
1218
+ A0 = len(_pe.areaRng)
1219
+
1220
+ has_precomputed_evals = not (self.evals_per_cat_area is None)
1221
+
1222
+ if has_precomputed_evals:
1223
+ evals_per_cat_area = self.evals_per_cat_area
1224
+ else:
1225
+ evals_per_cat_area = {}
1226
+
1227
+ # retrieve E at each category, area range, and max number of detections
1228
+ for k, (k0, catId) in enumerate(zip(k_list, catid_list)):
1229
+ Nk = k0*A0*I0
1230
+ for a, a0 in enumerate(a_list):
1231
+ Na = a0*I0
1232
+
1233
+ if has_precomputed_evals:
1234
+ E = evals_per_cat_area[(catId, a)]
1235
+
1236
+ else:
1237
+ E = [self.evalImgs[Nk + Na + i] for i in i_list]
1238
+ E = [e for e in E if not e is None]
1239
+ evals_per_cat_area[(catId, a)] = E
1240
+
1241
+ if len(E) == 0:
1242
+ continue
1243
+
1244
+ for m, maxDet in enumerate(m_list):
1245
+
1246
+ dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
1247
+
1248
+ # different sorting methods generate slightly different results.
+ # mergesort is used to be consistent with the Matlab implementation.
1250
+ inds = np.argsort(-dtScores, kind='mergesort')
1251
+ dtScoresSorted = dtScores[inds]
1252
+
1253
+ dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
1254
+ dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
1255
+ gtIg = np.concatenate([e['gtIgnore'] for e in E])
1256
+ npig = np.count_nonzero(gtIg==0)
1257
+
1258
+ if npig == 0:
1259
+ continue
1260
+
1261
+ tps = np.logical_and( dtm, np.logical_not(dtIg) )
1262
+ fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
1263
+
1264
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
1265
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
1266
+
1267
+ for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
1268
+ tp = np.array(tp)
1269
+ fp = np.array(fp)
1270
+ nd = len(tp)
1271
+ rc = tp / npig
1272
+ pr = tp / (fp+tp+np.spacing(1))
1273
+ q = np.zeros((R,))
1274
+ ss = np.zeros((R,))
1275
+
1276
+ if nd:
1277
+ recall[t,k,a,m] = rc[-1]
1278
+
1279
+ else:
1280
+ recall[t,k,a,m] = 0
1281
+
1282
+ # numpy is slow without cython optimization for accessing elements
1283
+ # using python lists gives a significant speed improvement
1284
+ pr = pr.tolist(); q = q.tolist()
1285
+
1286
+ for i in range(nd-1, 0, -1):
1287
+ if pr[i] > pr[i-1]:
1288
+ pr[i-1] = pr[i]
1289
+
1290
+ inds = np.searchsorted(rc, p.recThrs, side='left')
1291
+
1292
+ try:
1293
+ for ri, pi in enumerate(inds):
1294
+ q[ri] = pr[pi]
1295
+ ss[ri] = dtScoresSorted[pi]
1296
+ except:
1297
+ pass
1298
+
1299
+ precision[t,:,k,a,m] = np.array(q)
1300
+ scores[t,:,k,a,m] = np.array(ss)
1301
+
1302
+ self.evals_per_cat_area = evals_per_cat_area
1303
+
1304
+ self.eval = {
1305
+ 'params': p,
1306
+ 'counts': [T, R, K, A, M],
1307
+ 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
1308
+ 'precision': precision,
1309
+ 'recall': recall,
1310
+ 'scores': scores,
1311
+ }
1312
+
1313
+ toc = time.time()
1314
+ print('DONE (t={:0.2f}s).'.format( toc-tic))
1315
+
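The interpolation inside accumulate() is the standard COCO recipe: precision is made non-increasing from right to left, then sampled at the recall thresholds with searchsorted. A self-contained numpy sketch of just that step, on made-up match data:

# Standalone sketch of the precision-envelope + recall sampling used above (toy data).
import numpy as np

tp = np.array([1, 1, 0, 1, 0, 0])          # score-sorted detections: 1 = matched, 0 = not
fp = 1 - tp
tp_sum, fp_sum = np.cumsum(tp), np.cumsum(fp)
npig = 4                                   # number of non-ignored ground truths (toy)

rc = tp_sum / npig
pr = (tp_sum / (tp_sum + fp_sum + np.spacing(1))).tolist()

# right-to-left maximum makes precision monotonically non-increasing
for i in range(len(pr) - 1, 0, -1):
    if pr[i] > pr[i - 1]:
        pr[i - 1] = pr[i]

recThrs = np.linspace(0.0, 1.0, 101)
q = np.zeros_like(recThrs)
inds = np.searchsorted(rc, recThrs, side="left")
for ri, pi in enumerate(inds):
    if pi < len(pr):                       # recalls never reached keep precision 0
        q[ri] = pr[pi]
print(q[[0, 50, 60, 80]])                  # [1.   1.   0.75 0.  ]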
1316
+ def evaluate(self):
1317
+ """
1318
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
1319
+ """
1320
+
1321
+ print("Running per image evaluation...")
1322
+
1323
+ p = self.params
1324
+ print("Evaluate annotation type *{}*".format(p.iouType))
1325
+
1326
+ tic = time.time()
1327
+
1328
+ p.imgIds = list(np.unique(p.imgIds))
1329
+ if p.useCats:
1330
+ p.catIds = list(np.unique(p.catIds))
1331
+
1332
+ p.maxDets = sorted(p.maxDets)
1333
+ self.params = p
1334
+
1335
+ self._prepare()
1336
+
1337
+ catIds = p.catIds if p.useCats else [-1]
1338
+
1339
+ # loop through images, area range, max detection number
1340
+ self.ious = {
1341
+ (imgId, catId): self.computeIoU(imgId, catId)
1342
+ for imgId in p.imgIds
1343
+ for catId in catIds
1344
+ }
1345
+
1346
+ maxDet = p.maxDets[-1]
1347
+
1348
+ self.evalImgs = [
1349
+ self.evaluateImg(imgId, catId, areaRng, maxDet)
1350
+ for catId in catIds
1351
+ for areaRng in p.areaRng
1352
+ for imgId in p.imgIds
1353
+ ]
1354
+
1355
+ self._paramsEval = copy.deepcopy(self.params)
1356
+
1357
+ toc = time.time()
1358
+ print("DONE (t={:0.2f}s).".format(toc - tic))
1359
+
1360
+ def computeIoU(self, imgId, catId):
1361
+ """
1362
+ ComputeIoU computes the IoUs by sorting based on "score"
1363
+ for either 2D boxes (in 2D mode) or 3D boxes (in 3D mode)
1364
+ """
1365
+
1366
+ device = (torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"))
1367
+
1368
+ p = self.params
1369
+ if p.useCats:
1370
+ gt = self._gts[imgId, catId]
1371
+ dt = self._dts[imgId, catId]
1372
+
1373
+ else:
1374
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
1375
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
1376
+
1377
+ if len(gt) == 0 and len(dt) == 0:
1378
+ return []
1379
+
1380
+ inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
1381
+ dt = [dt[i] for i in inds]
1382
+ if len(dt) > p.maxDets[-1]:
1383
+ dt = dt[0 : p.maxDets[-1]]
1384
+
1385
+ if p.iouType == "bbox":
1386
+ if self.mode == "2D":
1387
+ g = [g["bbox"] for g in gt]
1388
+ d = [d["bbox"] for d in dt]
1389
+ elif self.mode == "3D":
1390
+ g = [g["bbox3D"] for g in gt]
1391
+ d = [d["bbox3D"] for d in dt]
1392
+ else:
1393
+ raise Exception("unknown iouType for iou computation")
1394
+
1395
+ # compute iou between each dt and gt region
1396
+ # iscrowd is required in builtin maskUtils so we
1397
+ # use a dummy buffer for it
1398
+ iscrowd = [0 for o in gt]
1399
+ if self.mode == "2D":
1400
+ ious = maskUtils.iou(d, g, iscrowd)
1401
+
1402
+ elif len(d) > 0 and len(g) > 0:
1403
+
1404
+ # For 3D eval, we want to run IoU in CUDA if available
1405
+ if torch.cuda.is_available() and len(d) * len(g) < MAX_DTS_CROSS_GTS_FOR_IOU3D:
1406
+ device = torch.device("cuda:0")
1407
+ else:
1408
+ device = torch.device("cpu")
1409
+
1410
+ dd = torch.tensor(d, device=device, dtype=torch.float32)
1411
+ gg = torch.tensor(g, device=device, dtype=torch.float32)
1412
+
1413
+ ious = box3d_overlap(dd, gg).cpu().numpy()
1414
+
1415
+ else:
1416
+ ious = []
1417
+
1418
+ in_prox = None
1419
+
1420
+ if self.eval_prox:
1421
+ g = [g["bbox"] for g in gt]
1422
+ d = [d["bbox"] for d in dt]
1423
+ iscrowd = [0 for o in gt]
1424
+ ious2d = maskUtils.iou(d, g, iscrowd)
1425
+
1426
+ if type(ious2d) == list:
1427
+ in_prox = []
1428
+
1429
+ else:
1430
+ in_prox = ious2d > p.proximity_thresh
1431
+
1432
+ return ious, in_prox
1433
+
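For proximity evaluation, the 2D IoUs computed at the end of computeIoU are thresholded at p.proximity_thresh to decide which detection/GT pairs count as neighbours (maskUtils.iou consumes COCO-style [x, y, w, h] boxes). A plain stand-in for that test on two hand-made boxes:

# Plain stand-in for the 2D IoU + proximity test above (toy boxes in [x, y, w, h] format).
def iou_xywh(a, b):
    ax, ay, aw, ah = a
    bx, by, bw, bh = b
    ix1, iy1 = max(ax, bx), max(ay, by)
    ix2, iy2 = min(ax + aw, bx + bw), min(ay + ah, by + bh)
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    return inter / (aw * ah + bw * bh - inter)

dt_box = [10, 10, 50, 50]
gt_box = [30, 30, 50, 50]
proximity_thresh = 0.3                         # same default as Omni3DParams
iou = iou_xywh(dt_box, gt_box)
print(round(iou, 3), iou > proximity_thresh)   # 0.22 False -> this pair is not "in proximity"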
1434
+ def evaluateImg(self, imgId, catId, aRng, maxDet):
1435
+ """
1436
+ Perform evaluation for single category and image
1437
+ Returns:
1438
+ dict (single image results)
1439
+ """
1440
+
1441
+ p = self.params
1442
+ if p.useCats:
1443
+ gt = self._gts[imgId, catId]
1444
+ dt = self._dts[imgId, catId]
1445
+
1446
+ else:
1447
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
1448
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
1449
+
1450
+ if len(gt) == 0 and len(dt) == 0:
1451
+ return None
1452
+
1453
+ flag_range = "area" if self.mode == "2D" else "depth"
1454
+ flag_ignore = "ignore2D" if self.mode == "2D" else "ignore3D"
1455
+
1456
+ for g in gt:
1457
+ if g[flag_ignore] or (g[flag_range] < aRng[0] or g[flag_range] > aRng[1]):
1458
+ g["_ignore"] = 1
1459
+ else:
1460
+ g["_ignore"] = 0
1461
+
1462
+ # sort dt highest score first, sort gt ignore last
1463
+ gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort")
1464
+ gt = [gt[i] for i in gtind]
1465
+ dtind = np.argsort([-d["score"] for d in dt], kind="mergesort")
1466
+ dt = [dt[i] for i in dtind[0:maxDet]]
1467
+
1468
+ # load computed ious
1469
+ ious = (
1470
+ self.ious[imgId, catId][0][:, gtind]
1471
+ if len(self.ious[imgId, catId][0]) > 0
1472
+ else self.ious[imgId, catId][0]
1473
+ )
1474
+
1475
+ if self.eval_prox:
1476
+ in_prox = (
1477
+ self.ious[imgId, catId][1][:, gtind]
1478
+ if len(self.ious[imgId, catId][1]) > 0
1479
+ else self.ious[imgId, catId][1]
1480
+ )
1481
+
1482
+ T = len(p.iouThrs)
1483
+ G = len(gt)
1484
+ D = len(dt)
1485
+ gtm = np.zeros((T, G))
1486
+ dtm = np.zeros((T, D))
1487
+ gtIg = np.array([g["_ignore"] for g in gt])
1488
+ dtIg = np.zeros((T, D))
1489
+
1490
+ if not len(ious) == 0:
1491
+ for tind, t in enumerate(p.iouThrs):
1492
+ for dind, d in enumerate(dt):
1493
+
1494
+ # information about best match so far (m=-1 -> unmatched)
1495
+ iou = min([t, 1 - 1e-10])
1496
+ m = -1
1497
+
1498
+ for gind, g in enumerate(gt):
1499
+ # in case of proximity evaluation, if not in proximity continue
1500
+ if self.eval_prox and not in_prox[dind, gind]:
1501
+ continue
1502
+
1503
+ # if this gt already matched, continue
1504
+ if gtm[tind, gind] > 0:
1505
+ continue
1506
+
1507
+ # if dt matched to reg gt, and on ignore gt, stop
1508
+ if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
1509
+ break
1510
+
1511
+ # continue to next gt unless better match made
1512
+ if ious[dind, gind] < iou:
1513
+ continue
1514
+
1515
+ # if match successful and best so far, store appropriately
1516
+ iou = ious[dind, gind]
1517
+ m = gind
1518
+
1519
+ # if match made store id of match for both dt and gt
1520
+ if m == -1:
1521
+ continue
1522
+
1523
+ dtIg[tind, dind] = gtIg[m]
1524
+ dtm[tind, dind] = gt[m]["id"]
1525
+ gtm[tind, m] = d["id"]
1526
+
1527
+ # set unmatched detections outside of area range to ignore
1528
+ a = np.array(
1529
+ [d[flag_range] < aRng[0] or d[flag_range] > aRng[1] for d in dt]
1530
+ ).reshape((1, len(dt)))
1531
+
1532
+ dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
1533
+
1534
+ # in case of proximity evaluation, ignore detections which are far from gt regions
1535
+ if self.eval_prox and len(in_prox) > 0:
1536
+ dt_far = in_prox.any(1) == 0
1537
+ dtIg = np.logical_or(dtIg, np.repeat(dt_far.reshape((1, len(dt))), T, 0))
1538
+
1539
+ # store results for given image and category
1540
+ return {
1541
+ "image_id": imgId,
1542
+ "category_id": catId,
1543
+ "aRng": aRng,
1544
+ "maxDet": maxDet,
1545
+ "dtIds": [d["id"] for d in dt],
1546
+ "gtIds": [g["id"] for g in gt],
1547
+ "dtMatches": dtm,
1548
+ "gtMatches": gtm,
1549
+ "dtScores": [d["score"] for d in dt],
1550
+ "gtIgnore": gtIg,
1551
+ "dtIgnore": dtIg,
1552
+ }
1553
+
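The matching loop above is the usual COCO greedy assignment: detections are visited in descending score order and each one takes the still-unmatched ground truth with the highest IoU above the threshold. A tiny sketch of that rule on a made-up IoU matrix:

# Toy greedy matching in the style of evaluateImg (scores already sorted, IoUs made up).
import numpy as np

iou_thr = 0.25
ious = np.array([[0.60, 0.30],    # detection 0 (highest score) vs GTs 0 and 1
                 [0.55, 0.40]])   # detection 1 vs GTs 0 and 1

gt_matched = [-1, -1]
dt_matched = [-1, -1]
for dind in range(ious.shape[0]):
    best_iou, best_g = iou_thr, -1
    for gind in range(ious.shape[1]):
        if gt_matched[gind] != -1:            # each GT may only be matched once
            continue
        if ious[dind, gind] < best_iou:       # keep the best match seen so far
            continue
        best_iou, best_g = ious[dind, gind], gind
    if best_g != -1:
        dt_matched[dind] = best_g
        gt_matched[best_g] = dind

print(dt_matched)  # [0, 1]: det 0 takes GT 0, so det 1 falls back to GT 1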
1554
+ def summarize(self):
1555
+ """
1556
+ Compute and display summary metrics for evaluation results.
1557
+ Note this function can *only* be applied to the default parameter setting
1558
+ """
1559
+
1560
+ def _summarize(mode, ap=1, iouThr=None, areaRng="all", maxDets=100, log_str=""):
1561
+ p = self.params
1562
+ eval = self.eval
1563
+
1564
+ if mode == "2D":
1565
+ iStr = (" {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}")
1566
+
1567
+ elif mode == "3D":
1568
+ iStr = " {:<18} {} @[ IoU={:<9} | depth={:>6s} | maxDets={:>3d} ] = {:0.3f}"
1569
+
1570
+ titleStr = "Average Precision" if ap == 1 else "Average Recall"
1571
+ typeStr = "(AP)" if ap == 1 else "(AR)"
1572
+
1573
+ iouStr = (
1574
+ "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
1575
+ if iouThr is None
1576
+ else "{:0.2f}".format(iouThr)
1577
+ )
1578
+
1579
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
1580
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
1581
+
1582
+ if ap == 1:
1583
+
1584
+ # dimension of precision: [TxRxKxAxM]
1585
+ s = eval["precision"]
1586
+
1587
+ # IoU
1588
+ if iouThr is not None:
1589
+ t = np.where(np.isclose(iouThr, p.iouThrs.astype(float)))[0]
1590
+ s = s[t]
1591
+
1592
+ s = s[:, :, :, aind, mind]
1593
+
1594
+ else:
1595
+ # dimension of recall: [TxKxAxM]
1596
+ s = eval["recall"]
1597
+ if iouThr is not None:
1598
+ t = np.where(iouThr == p.iouThrs)[0]
1599
+ s = s[t]
1600
+ s = s[:, :, aind, mind]
1601
+
1602
+ if len(s[s > -1]) == 0:
1603
+ mean_s = -1
1604
+
1605
+ else:
1606
+ mean_s = np.mean(s[s > -1])
1607
+
1608
+ if log_str != "":
1609
+ log_str += "\n"
1610
+
1611
+ log_str += "mode={} ".format(mode) + \
1612
+ iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)
1613
+
1614
+ return mean_s, log_str
1615
+
1616
+ def _summarizeDets(mode):
1617
+
1618
+ params = self.params
1619
+
1620
+ # the thresholds here define the thresholds printed in `derive_omni_results`
1621
+ thres = [0.5, 0.75, 0.95] if mode == "2D" else [0.15, 0.25, 0.50]
1622
+
1623
+ stats = np.zeros((13,))
1624
+ stats[0], log_str = _summarize(mode, 1)
1625
+
1626
+ stats[1], log_str = _summarize(
1627
+ mode, 1, iouThr=thres[0], maxDets=params.maxDets[2], log_str=log_str
1628
+ )
1629
+
1630
+ stats[2], log_str = _summarize(
1631
+ mode, 1, iouThr=thres[1], maxDets=params.maxDets[2], log_str=log_str
1632
+ )
1633
+
1634
+ stats[3], log_str = _summarize(
1635
+ mode, 1, iouThr=thres[2], maxDets=params.maxDets[2], log_str=log_str
1636
+ )
1637
+
1638
+ stats[4], log_str = _summarize(
1639
+ mode,
1640
+ 1,
1641
+ areaRng=params.areaRngLbl[1],
1642
+ maxDets=params.maxDets[2],
1643
+ log_str=log_str,
1644
+ )
1645
+
1646
+ stats[5], log_str = _summarize(
1647
+ mode,
1648
+ 1,
1649
+ areaRng=params.areaRngLbl[2],
1650
+ maxDets=params.maxDets[2],
1651
+ log_str=log_str,
1652
+ )
1653
+
1654
+ stats[6], log_str = _summarize(
1655
+ mode,
1656
+ 1,
1657
+ areaRng=params.areaRngLbl[3],
1658
+ maxDets=params.maxDets[2],
1659
+ log_str=log_str,
1660
+ )
1661
+
1662
+ stats[7], log_str = _summarize(
1663
+ mode, 0, maxDets=params.maxDets[0], log_str=log_str
1664
+ )
1665
+
1666
+ stats[8], log_str = _summarize(
1667
+ mode, 0, maxDets=params.maxDets[1], log_str=log_str
1668
+ )
1669
+
1670
+ stats[9], log_str = _summarize(
1671
+ mode, 0, maxDets=params.maxDets[2], log_str=log_str
1672
+ )
1673
+
1674
+ stats[10], log_str = _summarize(
1675
+ mode,
1676
+ 0,
1677
+ areaRng=params.areaRngLbl[1],
1678
+ maxDets=params.maxDets[2],
1679
+ log_str=log_str,
1680
+ )
1681
+
1682
+ stats[11], log_str = _summarize(
1683
+ mode,
1684
+ 0,
1685
+ areaRng=params.areaRngLbl[2],
1686
+ maxDets=params.maxDets[2],
1687
+ log_str=log_str,
1688
+ )
1689
+
1690
+ stats[12], log_str = _summarize(
1691
+ mode,
1692
+ 0,
1693
+ areaRng=params.areaRngLbl[3],
1694
+ maxDets=params.maxDets[2],
1695
+ log_str=log_str,
1696
+ )
1697
+
1698
+ return stats, log_str
1699
+
1700
+ if not self.eval:
1701
+ raise Exception("Please run accumulate() first")
1702
+
1703
+ stats, log_str = _summarizeDets(self.mode)
1704
+ self.stats = stats
1705
+
1706
+ return log_str
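A hedged sketch of how this evaluator is typically driven end to end (cocoGt/cocoDt are assumed to be existing COCO-style API objects; the evaluate → accumulate → summarize order is required by the asserts above):

# Sketch: driving Omni3Deval on pre-built COCO-style GT/DT objects (cocoGt, cocoDt assumed).
def run_omni3d_eval(cocoGt, cocoDt, mode="3D", eval_prox=True):
    evaluator = Omni3Deval(cocoGt, cocoDt, iouType="bbox", mode=mode, eval_prox=eval_prox)
    evaluator.evaluate()          # per-image IoUs + greedy matching, fills evalImgs
    evaluator.accumulate()        # PR curves over thresholds, ranges and maxDets
    print(evaluator.summarize())  # one AP/AR line per setting, prefixed with the mode
    return evaluator.stats        # 13 summary numbers, overall AP first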
cubercnn/modeling/backbone/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .densenet import *
+ from .mnasnet import *
+ from .resnet import *
+ from .shufflenet import *
+ from .dla import *
cubercnn/modeling/backbone/densenet.py ADDED
@@ -0,0 +1,64 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates
+ from torchvision import models
+ from detectron2.layers import ShapeSpec
+ from detectron2.modeling.backbone import Backbone
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
+ import torch.nn.functional as F
+
+ from detectron2.modeling.backbone.fpn import FPN
+
+ class DenseNetBackbone(Backbone):
+     def __init__(self, cfg, input_shape, pretrained=True):
+         super().__init__()
+
+         base = models.densenet121(pretrained)
+         base = base.features
+
+         self.base = base
+
+         self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 1024, 'p6': 1024}
+         self._out_feature_strides = {'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
+         self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
+
+     def forward(self, x):
+
+         outputs = {}
+
+         db1 = self.base[0:5](x)
+         db2 = self.base[5:7](db1)
+         db3 = self.base[7:9](db2)
+         p5 = self.base[9:](db3)
+         p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
+         outputs['p2'] = db1
+         outputs['p3'] = db2
+         outputs['p4'] = db3
+         outputs['p5'] = p5
+         outputs['p6'] = p6
+
+         return outputs
+
+
+ @BACKBONE_REGISTRY.register()
+ def build_densenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
+     """
+     Args:
+         cfg: a detectron2 CfgNode
+
+     Returns:
+         backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+     """
+
+     imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
+
+     bottom_up = DenseNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain)
+     in_features = cfg.MODEL.FPN.IN_FEATURES
+     out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+
+     backbone = FPN(
+         bottom_up=bottom_up,
+         in_features=in_features,
+         out_channels=out_channels,
+         norm=cfg.MODEL.FPN.NORM,
+         fuse_type=cfg.MODEL.FPN.FUSE_TYPE
+     )
+     return backbone
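Every torchvision-based backbone in this folder follows the same pattern: slice the classifier into stages that emit p2–p6 at strides 4–64, then hand them to detectron2's FPN. A small sanity-check sketch (pretrained weights disabled; cfg/input_shape are unused by this __init__) that runs a dummy image through DenseNetBackbone and prints the pyramid shapes:

# Sanity-check sketch for the stage slicing above (no cfg needed, no weight download).
import torch

backbone = DenseNetBackbone(cfg=None, input_shape=None, pretrained=False)
with torch.no_grad():
    feats = backbone(torch.zeros(1, 3, 256, 256))

for name, feat in feats.items():
    print(name, tuple(feat.shape))
# expected, given the channels/strides declared above:
#   p2 (1, 256, 64, 64)   p3 (1, 512, 32, 32)   p4 (1, 1024, 16, 16)
#   p5 (1, 1024, 8, 8)    p6 (1, 1024, 4, 4)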
cubercnn/modeling/backbone/dla.py ADDED
@@ -0,0 +1,507 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import os
3
+ import math
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.utils.model_zoo as model_zoo
9
+ import torch.nn.functional as F
10
+ import detectron2.utils.comm as comm
11
+
12
+ from detectron2.layers import ShapeSpec
13
+ from detectron2.modeling.backbone import Backbone
14
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
15
+ from detectron2.modeling.backbone.fpn import FPN
16
+
17
+ BatchNorm = nn.BatchNorm2d
18
+
19
+ """
20
+ Adapted models from repositories
21
+ Deep Layer Aggregation CVPR 2018
22
+ https://github.com/ucbdrive/dla
23
+ BSD-3 Licence https://github.com/ucbdrive/dla/blob/master/LICENSE
24
+
25
+ Geometry Uncertainty Projection Network for Monocular 3D Object Detection, ICCV 2021
26
+ https://github.com/SuperMHP/GUPNet/blob/main/code/lib/backbones/dla.py
27
+ MIT Licence https://github.com/SuperMHP/GUPNet/blob/main/LICENSE
28
+ """
29
+
30
+ def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
31
+ return os.path.join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
32
+
33
+
34
+ def conv3x3(in_planes, out_planes, stride=1):
35
+ "3x3 convolution with padding"
36
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
37
+ padding=1, bias=False)
38
+
39
+
40
+ class BasicBlock(nn.Module):
41
+ def __init__(self, inplanes, planes, stride=1, dilation=1):
42
+ super(BasicBlock, self).__init__()
43
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
44
+ stride=stride, padding=dilation,
45
+ bias=False, dilation=dilation)
46
+ self.bn1 = BatchNorm(planes)
47
+ self.relu = nn.ReLU(inplace=True)
48
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
49
+ stride=1, padding=dilation,
50
+ bias=False, dilation=dilation)
51
+ self.bn2 = BatchNorm(planes)
52
+ self.stride = stride
53
+
54
+ def forward(self, x, residual=None):
55
+ if residual is None:
56
+ residual = x
57
+
58
+ out = self.conv1(x)
59
+ out = self.bn1(out)
60
+ out = self.relu(out)
61
+
62
+ out = self.conv2(out)
63
+ out = self.bn2(out)
64
+
65
+ out += residual
66
+ out = self.relu(out)
67
+
68
+ return out
69
+
70
+
71
+ class Bottleneck(nn.Module):
72
+ expansion = 2
73
+
74
+ def __init__(self, inplanes, planes, stride=1, dilation=1):
75
+ super(Bottleneck, self).__init__()
76
+ expansion = Bottleneck.expansion
77
+ bottle_planes = planes // expansion
78
+ self.conv1 = nn.Conv2d(inplanes, bottle_planes,
79
+ kernel_size=1, bias=False)
80
+ self.bn1 = BatchNorm(bottle_planes)
81
+ self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
82
+ stride=stride, padding=dilation,
83
+ bias=False, dilation=dilation)
84
+ self.bn2 = BatchNorm(bottle_planes)
85
+ self.conv3 = nn.Conv2d(bottle_planes, planes,
86
+ kernel_size=1, bias=False)
87
+ self.bn3 = BatchNorm(planes)
88
+ self.relu = nn.ReLU(inplace=True)
89
+ self.stride = stride
90
+
91
+ def forward(self, x, residual=None):
92
+ if residual is None:
93
+ residual = x
94
+
95
+ out = self.conv1(x)
96
+ out = self.bn1(out)
97
+ out = self.relu(out)
98
+
99
+ out = self.conv2(out)
100
+ out = self.bn2(out)
101
+ out = self.relu(out)
102
+
103
+ out = self.conv3(out)
104
+ out = self.bn3(out)
105
+
106
+ out += residual
107
+ out = self.relu(out)
108
+
109
+ return out
110
+
111
+
112
+ class BottleneckX(nn.Module):
113
+ expansion = 2
114
+ cardinality = 32
115
+
116
+ def __init__(self, inplanes, planes, stride=1, dilation=1):
117
+ super(BottleneckX, self).__init__()
118
+ cardinality = BottleneckX.cardinality
119
+ # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
120
+ # bottle_planes = dim * cardinality
121
+ bottle_planes = planes * cardinality // 32
122
+ self.conv1 = nn.Conv2d(inplanes, bottle_planes,
123
+ kernel_size=1, bias=False)
124
+ self.bn1 = BatchNorm(bottle_planes)
125
+ self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
126
+ stride=stride, padding=dilation, bias=False,
127
+ dilation=dilation, groups=cardinality)
128
+ self.bn2 = BatchNorm(bottle_planes)
129
+ self.conv3 = nn.Conv2d(bottle_planes, planes,
130
+ kernel_size=1, bias=False)
131
+ self.bn3 = BatchNorm(planes)
132
+ self.relu = nn.ReLU(inplace=True)
133
+ self.stride = stride
134
+
135
+ def forward(self, x, residual=None):
136
+ if residual is None:
137
+ residual = x
138
+
139
+ out = self.conv1(x)
140
+ out = self.bn1(out)
141
+ out = self.relu(out)
142
+
143
+ out = self.conv2(out)
144
+ out = self.bn2(out)
145
+ out = self.relu(out)
146
+
147
+ out = self.conv3(out)
148
+ out = self.bn3(out)
149
+
150
+ out += residual
151
+ out = self.relu(out)
152
+
153
+ return out
154
+
155
+
156
+ class Root(nn.Module):
157
+ def __init__(self, in_channels, out_channels, kernel_size, residual):
158
+ super(Root, self).__init__()
159
+ self.conv = nn.Conv2d(
160
+ in_channels, out_channels, 1,
161
+ stride=1, bias=False, padding=(kernel_size - 1) // 2)
162
+ self.bn = BatchNorm(out_channels)
163
+ self.relu = nn.ReLU(inplace=True)
164
+ self.residual = residual
165
+
166
+ def forward(self, *x):
167
+ children = x
168
+ x = self.conv(torch.cat(x, 1))
169
+ x = self.bn(x)
170
+ if self.residual:
171
+ x += children[0]
172
+ x = self.relu(x)
173
+
174
+ return x
175
+
176
+
177
+ class Tree(nn.Module):
178
+ def __init__(self, levels, block, in_channels, out_channels, stride=1,
179
+ level_root=False, root_dim=0, root_kernel_size=1,
180
+ dilation=1, root_residual=False):
181
+ super(Tree, self).__init__()
182
+ if root_dim == 0:
183
+ root_dim = 2 * out_channels
184
+ if level_root:
185
+ root_dim += in_channels
186
+ if levels == 1:
187
+ self.tree1 = block(in_channels, out_channels, stride,
188
+ dilation=dilation)
189
+ self.tree2 = block(out_channels, out_channels, 1,
190
+ dilation=dilation)
191
+ else:
192
+ self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
193
+ stride, root_dim=0,
194
+ root_kernel_size=root_kernel_size,
195
+ dilation=dilation, root_residual=root_residual)
196
+ self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
197
+ root_dim=root_dim + out_channels,
198
+ root_kernel_size=root_kernel_size,
199
+ dilation=dilation, root_residual=root_residual)
200
+ if levels == 1:
201
+ self.root = Root(root_dim, out_channels, root_kernel_size,
202
+ root_residual)
203
+ self.level_root = level_root
204
+ self.root_dim = root_dim
205
+ self.downsample = None
206
+ self.project = None
207
+ self.levels = levels
208
+ if stride > 1:
209
+ self.downsample = nn.MaxPool2d(stride, stride=stride)
210
+ if in_channels != out_channels:
211
+ self.project = nn.Sequential(
212
+ nn.Conv2d(in_channels, out_channels,
213
+ kernel_size=1, stride=1, bias=False),
214
+ BatchNorm(out_channels)
215
+ )
216
+
217
+ def forward(self, x, residual=None, children=None):
218
+ children = [] if children is None else children
219
+ bottom = self.downsample(x) if self.downsample else x
220
+ residual = self.project(bottom) if self.project else bottom
221
+ if self.level_root:
222
+ children.append(bottom)
223
+ x1 = self.tree1(x, residual)
224
+ if self.levels == 1:
225
+ x2 = self.tree2(x1)
226
+ x = self.root(x2, x1, *children)
227
+ else:
228
+ children.append(x1)
229
+ x = self.tree2(x1, children=children)
230
+ return x
231
+
232
+
233
+ class DLA(nn.Module):
234
+ def __init__(self, levels, channels, num_classes=1000,
235
+ block=BasicBlock, residual_root=False, return_levels=False,
236
+ pool_size=7, linear_root=False):
237
+ super(DLA, self).__init__()
238
+ self.channels = channels
239
+ self.return_levels = return_levels
240
+ self.num_classes = num_classes
241
+ self.base_layer = nn.Sequential(
242
+ nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
243
+ padding=3, bias=False),
244
+ BatchNorm(channels[0]),
245
+ nn.ReLU(inplace=True))
246
+ self.level0 = self._make_conv_level(
247
+ channels[0], channels[0], levels[0])
248
+ self.level1 = self._make_conv_level(
249
+ channels[0], channels[1], levels[1], stride=2)
250
+ self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
251
+ level_root=False,
252
+ root_residual=residual_root)
253
+ self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
254
+ level_root=True, root_residual=residual_root)
255
+ self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
256
+ level_root=True, root_residual=residual_root)
257
+ self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
258
+ level_root=True, root_residual=residual_root)
259
+
260
+ self.avgpool = nn.AvgPool2d(pool_size)
261
+
262
+ for m in self.modules():
263
+ if isinstance(m, nn.Conv2d):
264
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
265
+ m.weight.data.normal_(0, math.sqrt(2. / n))
266
+ elif isinstance(m, BatchNorm):
267
+ m.weight.data.fill_(1)
268
+ m.bias.data.zero_()
269
+
270
+ def _make_level(self, block, inplanes, planes, blocks, stride=1):
271
+ downsample = None
272
+ if stride != 1 or inplanes != planes:
273
+ downsample = nn.Sequential(
274
+ nn.MaxPool2d(stride, stride=stride),
275
+ nn.Conv2d(inplanes, planes,
276
+ kernel_size=1, stride=1, bias=False),
277
+ BatchNorm(planes),
278
+ )
279
+
280
+ layers = []
281
+ layers.append(block(inplanes, planes, stride, downsample=downsample))
282
+ for i in range(1, blocks):
283
+ layers.append(block(inplanes, planes))
284
+
285
+ return nn.Sequential(*layers)
286
+
287
+ def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
288
+ modules = []
289
+ for i in range(convs):
290
+ modules.extend([
291
+ nn.Conv2d(inplanes, planes, kernel_size=3,
292
+ stride=stride if i == 0 else 1,
293
+ padding=dilation, bias=False, dilation=dilation),
294
+ BatchNorm(planes),
295
+ nn.ReLU(inplace=True)])
296
+ inplanes = planes
297
+ return nn.Sequential(*modules)
298
+
299
+
300
+ def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):
301
+
302
+ # load model only on main process
303
+ # to prevent redundent model caching
304
+ if comm.is_main_process():
305
+ model_url = get_model_url(data, name, hash)
306
+ model_weights = model_zoo.load_url(model_url)
307
+ del model_weights['fc.weight']
308
+ del model_weights['fc.bias']
309
+ self.load_state_dict(model_weights)
310
+
311
+
312
+ def dla34(pretrained=False, tricks=False, **kwargs): # DLA-34
313
+ model = DLA([1, 1, 1, 2, 2, 1],
314
+ [16, 32, 64, 128, 256, 512],
315
+ block=BasicBlock, **kwargs)
316
+ if pretrained:
317
+ if tricks:
318
+ model.load_pretrained_model(data='imagenet', name='dla34+tricks', hash='24a49e58')
319
+ else:
320
+ model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
321
+ return model
322
+
323
+
324
+ def dla46_c(pretrained=False, **kwargs): # DLA-46-C
325
+ Bottleneck.expansion = 2
326
+ model = DLA([1, 1, 1, 2, 2, 1],
327
+ [16, 32, 64, 64, 128, 256],
328
+ block=Bottleneck, **kwargs)
329
+ if pretrained:
330
+ model.load_pretrained_model(data='imagenet', name='dla46_c', hash='2bfd52c3')
331
+ return model
332
+
333
+
334
+ def dla46x_c(pretrained=False, **kwargs): # DLA-X-46-C
335
+ BottleneckX.expansion = 2
336
+ model = DLA([1, 1, 1, 2, 2, 1],
337
+ [16, 32, 64, 64, 128, 256],
338
+ block=BottleneckX, **kwargs)
339
+ if pretrained:
340
+ model.load_pretrained_model(data='imagenet', name='dla46x_c', hash='d761bae7')
341
+ return model
342
+
343
+
344
+ def dla60x_c(pretrained=False, **kwargs): # DLA-X-60-C
345
+ BottleneckX.expansion = 2
346
+ model = DLA([1, 1, 1, 2, 3, 1],
347
+ [16, 32, 64, 64, 128, 256],
348
+ block=BottleneckX, **kwargs)
349
+ if pretrained:
350
+ model.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c')
351
+ return model
352
+
353
+
354
+ def dla60(pretrained=False, tricks=False, **kwargs): # DLA-60
355
+ Bottleneck.expansion = 2
356
+ model = DLA([1, 1, 1, 2, 3, 1],
357
+ [16, 32, 128, 256, 512, 1024],
358
+ block=Bottleneck, **kwargs)
359
+ if pretrained:
360
+ if tricks:
361
+ model.load_pretrained_model(data='imagenet', name='dla60+tricks', hash='14488826')
362
+ else:
363
+ model.load_pretrained_model(data='imagenet', name='dla60', hash='24839fc4')
364
+
365
+ return model
366
+
367
+
368
+ def dla60x(pretrained=False, **kwargs): # DLA-X-60
369
+ BottleneckX.expansion = 2
370
+ model = DLA([1, 1, 1, 2, 3, 1],
371
+ [16, 32, 128, 256, 512, 1024],
372
+ block=BottleneckX, **kwargs)
373
+ if pretrained:
374
+ model.load_pretrained_model(data='imagenet', name='dla60x', hash='d15cacda')
375
+ return model
376
+
377
+
378
+ def dla102(pretrained=False, tricks=False, **kwargs): # DLA-102
379
+ Bottleneck.expansion = 2
380
+ model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
381
+ block=Bottleneck, residual_root=True, **kwargs)
382
+ if pretrained:
383
+
384
+ if tricks:
385
+ model.load_pretrained_model(data='imagenet', name='dla102+tricks', hash='27a30eac')
386
+ else:
387
+ model.load_pretrained_model(data='imagenet', name='dla102', hash='d94d9790')
388
+ return model
389
+
390
+
391
+ def dla102x(pretrained=False, **kwargs): # DLA-X-102
392
+ BottleneckX.expansion = 2
393
+ model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
394
+ block=BottleneckX, residual_root=True, **kwargs)
395
+ if pretrained:
396
+ model.load_pretrained_model(data='imagenet', name='dla102x', hash='ad62be81')
397
+ return model
398
+
399
+
400
+ def dla102x2(pretrained=False, **kwargs): # DLA-X-102 64
401
+ BottleneckX.cardinality = 64
402
+ model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
403
+ block=BottleneckX, residual_root=True, **kwargs)
404
+ if pretrained:
405
+ model.load_pretrained_model(data='imagenet', name='dla102x2', hash='262837b6')
406
+ return model
407
+
408
+
409
+ def dla169(pretrained=False, **kwargs): # DLA-169
410
+ Bottleneck.expansion = 2
411
+ model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024],
412
+ block=Bottleneck, residual_root=True, **kwargs)
413
+ if pretrained:
414
+ model.load_pretrained_model(data='imagenet', name='dla169', hash='0914e092')
415
+ return model
416
+
417
+ class DLABackbone(Backbone):
418
+ def __init__(self, cfg, input_shape, pretrained=True):
419
+ super().__init__()
420
+
421
+ if cfg.MODEL.DLA.TYPE == 'dla34':
422
+ base = dla34(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS)
423
+ self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512}
424
+ elif cfg.MODEL.DLA.TYPE == 'dla46_c':
425
+ base = dla46_c(pretrained=pretrained)
426
+ self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256}
427
+ elif cfg.MODEL.DLA.TYPE == 'dla46x_c':
428
+ base = dla46x_c(pretrained=pretrained)
429
+ self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256}
430
+ elif cfg.MODEL.DLA.TYPE == 'dla60x_c':
431
+ base = dla60x_c(pretrained=pretrained)
432
+ self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256}
433
+ elif cfg.MODEL.DLA.TYPE == 'dla60':
434
+ base = dla60(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS)
435
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
436
+ elif cfg.MODEL.DLA.TYPE == 'dla60x':
437
+ base = dla60x(pretrained=pretrained)
438
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
439
+ elif cfg.MODEL.DLA.TYPE == 'dla102':
440
+ base = dla102(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS)
441
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
442
+ elif cfg.MODEL.DLA.TYPE == 'dla102x':
443
+ base = dla102x(pretrained=pretrained)
444
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
445
+ elif cfg.MODEL.DLA.TYPE == 'dla102x2':
446
+ base = dla102x2(pretrained=pretrained)
447
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
448
+ elif cfg.MODEL.DLA.TYPE == 'dla169':
449
+ base = dla169(pretrained=pretrained)
450
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
451
+
452
+ self.base_layer = base.base_layer
453
+ self.level0 = base.level0
454
+ self.level1 = base.level1
455
+ self.level2 = base.level2
456
+ self.level3 = base.level3
457
+ self.level4 = base.level4
458
+ self.level5 = base.level5
459
+
460
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
461
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
462
+
463
+ def forward(self, x):
464
+
465
+ outputs = {}
466
+
467
+ base_layer = self.base_layer(x)
468
+ level0 = self.level0(base_layer)
469
+ level1 = self.level1(level0)
470
+ level2 = self.level2(level1)
471
+ level3 = self.level3(level2)
472
+ level4 = self.level4(level3)
473
+ level5 = self.level5(level4)
474
+ level6 = F.max_pool2d(level5, kernel_size=1, stride=2, padding=0)
475
+
476
+ outputs['p2'] = level2
477
+ outputs['p3'] = level3
478
+ outputs['p4'] = level4
479
+ outputs['p5'] = level5
480
+ outputs['p6'] = level6
481
+
482
+ return outputs
483
+
484
+ @BACKBONE_REGISTRY.register()
485
+ def build_dla_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
486
+ """
487
+ Args:
488
+ cfg: a detectron2 CfgNode
489
+
490
+ Returns:
491
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
492
+ """
493
+
494
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
495
+
496
+ bottom_up = DLABackbone(cfg, input_shape, pretrained=imagenet_pretrain)
497
+ in_features = cfg.MODEL.FPN.IN_FEATURES
498
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
499
+
500
+ backbone = FPN(
501
+ bottom_up=bottom_up,
502
+ in_features=in_features,
503
+ out_channels=out_channels,
504
+ norm=cfg.MODEL.FPN.NORM,
505
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
506
+ )
507
+ return backbone
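DLA builds each stage as a recursive Tree whose children are aggregated through a Root node, and DLABackbone simply re-exposes level2–level5 (plus a pooled level6) as the p2–p6 pyramid. A short sketch, mirroring DLABackbone.forward for the dla34 variant with pretrained weights disabled, that prints the per-level shapes (shape comments assume a 256x256 input):

# Sketch: chain the dla34 levels the same way DLABackbone.forward does (no weight download).
import torch
import torch.nn.functional as F

m = dla34(pretrained=False)                  # per-level channels [16, 32, 64, 128, 256, 512]
x = torch.zeros(1, 3, 256, 256)
with torch.no_grad():
    x = m.level1(m.level0(m.base_layer(x)))  # strides 1, 1, 2 so far
    feats = {}
    for name, level in zip(['p2', 'p3', 'p4', 'p5'], [m.level2, m.level3, m.level4, m.level5]):
        x = level(x)                         # each Tree stage halves the resolution
        feats[name] = x
    feats['p6'] = F.max_pool2d(x, kernel_size=1, stride=2)

for name, feat in feats.items():
    print(name, tuple(feat.shape))
# p2 (1, 64, 64, 64)  p3 (1, 128, 32, 32)  p4 (1, 256, 16, 16)  p5 (1, 512, 8, 8)  p6 (1, 512, 4, 4)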
cubercnn/modeling/backbone/mnasnet.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from torchvision import models
3
+ from detectron2.layers import ShapeSpec
4
+ from detectron2.modeling.backbone import Backbone
5
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
6
+ import torch.nn.functional as F
7
+
8
+ from detectron2.modeling.backbone.fpn import FPN
9
+
10
+ class MNASNetBackbone(Backbone):
11
+ def __init__(self, cfg, input_shape, pretrained=True):
12
+ super().__init__()
13
+
14
+ base = models.mnasnet1_0(pretrained)
15
+ base = base.layers
16
+
17
+ self.base = base
18
+
19
+ self._out_feature_channels = {'p2': 24, 'p3': 40, 'p4': 96, 'p5': 320, 'p6': 320}
20
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
21
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
22
+
23
+ def forward(self, x):
24
+
25
+ outputs = {}
26
+
27
+ p2 = self.base[0:9](x)
28
+ p3 = self.base[9](p2)
29
+ p4 = self.base[10:12](p3)
30
+ p5 = self.base[12:14](p4)
31
+ p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
32
+ outputs['p2'] = p2
33
+ outputs['p3'] = p3
34
+ outputs['p4'] = p4
35
+ outputs['p5'] = p5
36
+ outputs['p6'] = p6
37
+
38
+ return outputs
39
+
40
+ @BACKBONE_REGISTRY.register()
41
+ def build_mnasnet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
42
+ """
43
+ Args:
44
+ cfg: a detectron2 CfgNode
45
+
46
+ Returns:
47
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
48
+ """
49
+
50
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
51
+
52
+ bottom_up = MNASNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain)
53
+ in_features = cfg.MODEL.FPN.IN_FEATURES
54
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
55
+
56
+ backbone = FPN(
57
+ bottom_up=bottom_up,
58
+ in_features=in_features,
59
+ out_channels=out_channels,
60
+ norm=cfg.MODEL.FPN.NORM,
61
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
62
+ )
63
+ return backbone
cubercnn/modeling/backbone/resnet.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from torchvision import models
3
+ from detectron2.layers import ShapeSpec
4
+ from detectron2.modeling.backbone import Backbone
5
+ from detectron2.modeling.backbone.fpn import LastLevelMaxPool
6
+ from detectron2.modeling.backbone.resnet import build_resnet_backbone
7
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
8
+ import torch.nn.functional as F
9
+
10
+ from detectron2.modeling.backbone.fpn import FPN
11
+
12
+ class ResNet(Backbone):
13
+ def __init__(self, cfg, input_shape, pretrained=True):
14
+ super().__init__()
15
+
16
+ if cfg.MODEL.RESNETS.DEPTH == 18:
17
+ base = models.resnet18(pretrained)
18
+ self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512}
19
+ elif cfg.MODEL.RESNETS.DEPTH == 34:
20
+ base = models.resnet34(pretrained)
21
+ self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512}
22
+ elif cfg.MODEL.RESNETS.DEPTH == 50:
23
+ base = models.resnet50(pretrained)
24
+ self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048}
25
+ elif cfg.MODEL.RESNETS.DEPTH == 101:
26
+ base = models.resnet101(pretrained)
27
+ self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048}
28
+ else:
29
+ raise ValueError('No configuration currently supporting depth of {}'.format(cfg.MODEL.RESNETS.DEPTH))
30
+
31
+ self.conv1 = base.conv1
32
+ self.bn1 = base.bn1
33
+ self.relu = base.relu
34
+ self.maxpool = base.maxpool
35
+ self.layer1 = base.layer1
36
+ self.layer2 = base.layer2
37
+ self.layer3 = base.layer3
38
+ self.layer4 = base.layer4
39
+
40
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
41
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
42
+
43
+ def forward(self, x):
44
+
45
+ outputs = {}
46
+
47
+ x = self.conv1(x)
48
+ x = self.bn1(x)
49
+ x = self.relu(x)
50
+ x = self.maxpool(x)
51
+ p2 = self.layer1(x)
52
+ p3 = self.layer2(p2)
53
+ p4 = self.layer3(p3)
54
+ p5 = self.layer4(p4)
55
+ p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
56
+
57
+ outputs['p2'] = p2
58
+ outputs['p3'] = p3
59
+ outputs['p4'] = p4
60
+ outputs['p5'] = p5
61
+ outputs['p6'] = p6
62
+
63
+ return outputs
64
+
65
+
66
+ @BACKBONE_REGISTRY.register()
67
+ def build_resnet_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
68
+ """
69
+ Args:
70
+ cfg: a detectron2 CfgNode
71
+
72
+ Returns:
73
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
74
+ """
75
+
76
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
77
+
78
+ if cfg.MODEL.RESNETS.TORCHVISION:
79
+ bottom_up = ResNet(cfg, input_shape, pretrained=imagenet_pretrain)
80
+
81
+ else:
82
+ # use the MSRA modeling logic to build the backbone.
83
+ bottom_up = build_resnet_backbone(cfg, input_shape)
84
+
85
+ in_features = cfg.MODEL.FPN.IN_FEATURES
86
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
87
+
88
+ backbone = FPN(
89
+ bottom_up=bottom_up,
90
+ in_features=in_features,
91
+ out_channels=out_channels,
92
+ norm=cfg.MODEL.FPN.NORM,
93
+ top_block=LastLevelMaxPool(),
94
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
95
+ )
96
+ return backbone
cubercnn/modeling/backbone/shufflenet.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from torchvision import models
3
+ from detectron2.layers import ShapeSpec
4
+ from detectron2.modeling.backbone import Backbone
5
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
6
+ import torch.nn.functional as F
7
+
8
+ from detectron2.modeling.backbone.fpn import FPN
9
+
10
+ class ShufflenetBackbone(Backbone):
11
+ def __init__(self, cfg, input_shape, pretrained=True):
12
+ super().__init__()
13
+
14
+ base = models.shufflenet_v2_x1_0(pretrained)
15
+ self.conv1 = base.conv1
16
+ self.maxpool = base.maxpool
17
+ self.stage2 = base.stage2
18
+ self.stage3 = base.stage3
19
+ self.stage4 = base.stage4
20
+ self.conv5 = base.conv5
21
+
22
+ self._out_feature_channels = {'p2': 24, 'p3': 116, 'p4': 232, 'p5': 464, 'p6': 464}
23
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
24
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
25
+
26
+ def forward(self, x):
27
+
28
+ outputs = {}
29
+
30
+ x = self.conv1(x)
31
+ p2 = self.maxpool(x)
32
+ p3 = self.stage2(p2)
33
+ p4 = self.stage3(p3)
34
+ p5 = self.stage4(p4)
35
+ p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
36
+
37
+ outputs['p2'] = p2
38
+ outputs['p3'] = p3
39
+ outputs['p4'] = p4
40
+ outputs['p5'] = p5
41
+ outputs['p6'] = p6
42
+
43
+ return outputs
44
+
45
+
46
+ @BACKBONE_REGISTRY.register()
47
+ def build_shufflenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
48
+ """
49
+ Args:
50
+ cfg: a detectron2 CfgNode
51
+
52
+ Returns:
53
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
54
+ """
55
+
56
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
57
+
58
+ bottom_up = ShufflenetBackbone(cfg, input_shape, pretrained=imagenet_pretrain)
59
+ in_features = cfg.MODEL.FPN.IN_FEATURES
60
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
61
+
62
+ backbone = FPN(
63
+ bottom_up=bottom_up,
64
+ in_features=in_features,
65
+ out_channels=out_channels,
66
+ norm=cfg.MODEL.FPN.NORM,
67
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
68
+ )
69
+ return backbone
cubercnn/modeling/meta_arch/__init__.py ADDED
@@ -0,0 +1 @@
+ from .rcnn3d import *
cubercnn/modeling/meta_arch/rcnn3d.py ADDED
@@ -0,0 +1,618 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import logging
3
+ from typing import Dict, List, Optional
4
+ from detectron2.layers import move_device_like
5
+ from detectron2.structures.image_list import ImageList
6
+ import torch
7
+ import numpy as np
8
+ from detectron2.layers import ShapeSpec, batched_nms
9
+ from detectron2.utils.visualizer import Visualizer
10
+ from detectron2.data.detection_utils import convert_image_to_rgb
11
+ from detectron2.structures import Instances
12
+ from detectron2.utils.events import get_event_storage
13
+ from detectron2.data import MetadataCatalog
14
+
15
+ from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
16
+ from detectron2.modeling.proposal_generator import build_proposal_generator
17
+ from detectron2.utils.logger import _log_api_usage
18
+ from detectron2.modeling.meta_arch import (
19
+ META_ARCH_REGISTRY, GeneralizedRCNN
20
+ )
21
+ # from cubercnn.data.generate_depth_maps import setup_depth_model
22
+ from cubercnn.modeling.roi_heads import build_roi_heads
23
+
24
+ from detectron2.data import MetadataCatalog
25
+ from cubercnn.modeling.roi_heads import build_roi_heads
26
+ from cubercnn import util, vis
27
+ import torch.nn.functional as F
28
+ from detectron2.config import configurable
29
+ import torch.nn as nn
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ @META_ARCH_REGISTRY.register()
35
+ class RCNN3D(GeneralizedRCNN):
36
+
37
+ @classmethod
38
+ def from_config(cls, cfg, priors=None):
39
+ backbone = build_backbone(cfg, priors=priors)
40
+ return {
41
+ "backbone": backbone,
42
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
43
+ "roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
44
+ "input_format": cfg.INPUT.FORMAT,
45
+ "vis_period": cfg.VIS_PERIOD,
46
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
47
+ "pixel_std": cfg.MODEL.PIXEL_STD,
48
+ }
49
+
50
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
51
+
52
+ if not self.training:
53
+ return self.inference(batched_inputs)
54
+
55
+ images = self.preprocess_image(batched_inputs)
56
+
57
+ # scaling factor for the sample relative to its original scale
58
+ # e.g., how much has the image been upsampled by? or downsampled?
59
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
60
+
61
+ # The unmodified intrinsics for the image
62
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
63
+
64
+ if "instances" in batched_inputs[0]:
65
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
66
+ else:
67
+ gt_instances = None
68
+
69
+ # the backbone is actually a FPN, where the DLA model is the bottom-up structure.
70
+ # FPN: https://arxiv.org/abs/1612.03144v2
71
+ # backbone and proposal generator only work on 2D images and annotations.
72
+ features = self.backbone(images.tensor)
73
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
74
+
75
+ instances, detector_losses = self.roi_heads(
76
+ images, features, proposals,
77
+ Ks, im_scales_ratio,
78
+ gt_instances
79
+ )
80
+
81
+ if self.vis_period > 0:
82
+ storage = get_event_storage()
83
+ if storage.iter % self.vis_period == 0 and storage.iter > 0:
84
+ self.visualize_training(batched_inputs, proposals, instances)
85
+
86
+ losses = {}
87
+ losses.update(detector_losses)
88
+ losses.update(proposal_losses)
89
+ return losses
90
+
91
+ def inference(
92
+ self,
93
+ batched_inputs: List[Dict[str, torch.Tensor]],
94
+ detected_instances: Optional[List[Instances]] = None,
95
+ do_postprocess: bool = True,
96
+ ):
97
+ assert not self.training
98
+
99
+ images = self.preprocess_image(batched_inputs)
100
+
101
+ # scaling factor for the sample relative to its original scale
102
+ # e.g., how much has the image been upsampled by? or downsampled?
103
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
104
+
105
+ # The unmodified intrinsics for the image
106
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
107
+
108
+ features = self.backbone(images.tensor)
109
+
110
+ # Pass oracle 2D boxes into the RoI heads
111
+ if type(batched_inputs == list) and np.any(['oracle2D' in b for b in batched_inputs]):
112
+ oracles = [b['oracle2D'] for b in batched_inputs]
113
+ results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None)
114
+
115
+ # normal inference
116
+ else:
117
+ proposals, _ = self.proposal_generator(images, features, None)
118
+ results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None)
119
+
120
+ if do_postprocess:
121
+ assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
122
+ return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
123
+ else:
124
+ return results
125
+
126
+ def visualize_training(self, batched_inputs, proposals, instances):
127
+ """
128
+ A function used to visualize images and proposals. It shows ground truth
129
+ bounding boxes on the original image and up to 20 top-scoring predicted
130
+ object proposals on the original image. Users can implement different
131
+ visualization functions for different models.
132
+ Args:
133
+ batched_inputs (list): a list that contains input to the model.
134
+ proposals (list): a list that contains predicted proposals. Both
135
+ batched_inputs and proposals should have the same length.
136
+ instances (list): a list that contains predicted RoIhead instances. Both
137
+ batched_inputs and proposals should have the same length.
138
+ """
139
+
140
+ storage = get_event_storage()
141
+
142
+ # minimum number of boxes to try to visualize per image
143
+ max_vis_prop = 20
144
+
145
+ if not hasattr(self, 'thing_classes'):
146
+ self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
147
+ self.num_classes = len(self.thing_classes)
148
+
149
+ # make a dummy for 2d scenario
150
+ only2d = instances is None
151
+ if only2d:
152
+ instances = [None]*len(batched_inputs)
153
+
154
+ for input, prop, instances_i in zip(batched_inputs, proposals, instances):
155
+
156
+ img = input["image"]
157
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
158
+ img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
159
+ img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
160
+
161
+ '''
162
+ Visualize the 2D GT and proposal predictions
163
+ '''
164
+ v_gt = Visualizer(img, None)
165
+ v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
166
+ anno_img = v_gt.get_image()
167
+ box_size = min(len(prop.proposal_boxes), max_vis_prop)
168
+ v_pred = Visualizer(img, None)
169
+ v_pred = v_pred.overlay_instances(
170
+ boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
171
+ )
172
+ prop_img = v_pred.get_image()
173
+ vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
174
+ vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
175
+ storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
176
+ if only2d:
177
+ break
178
+ '''
179
+ Visualize the 3D GT and predictions
180
+ '''
181
+ K = torch.tensor(input['K'], device=self.device)
182
+ scale = input['height']/img.shape[0]
183
+ fx, sx = (val.item()/scale for val in K[0, [0, 2]])
184
+ fy, sy = (val.item()/scale for val in K[1, [1, 2]])
185
+
186
+ K_scaled = torch.tensor(
187
+ [[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
188
+ dtype=torch.float32, device=self.device
189
+ ) @ K
190
+
191
+ gts_per_image = input["instances"]
192
+
193
+ gt_classes = gts_per_image.gt_classes
194
+
195
+ # Filter out irrelevant groundtruth
196
+ fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
197
+
198
+ gt_classes = gt_classes[fg_selection_mask]
199
+ gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
200
+ gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
201
+ gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
202
+
203
+ # projected 2D center, depth, w, h, l, 3D center
204
+ gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
205
+
206
+ # this box may have been mirrored and scaled so
207
+ # we need to recompute XYZ in 3D by backprojecting.
208
+ gt_z = gt_boxes3D[:, 2]
209
+
210
+ gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
211
+ gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
212
+
213
+ # put together the GT boxes
214
+ gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
215
+ gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
216
+
217
+ gt_colors = torch.tensor(
218
+ [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
219
+ device=self.device
220
+ )/255.0
221
+
222
+ gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
223
+
224
+ # perform a simple NMS, which is not cls dependent.
225
+ keep = batched_nms(
226
+ instances_i.pred_boxes.tensor,
227
+ instances_i.scores,
228
+ torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
229
+ self.roi_heads.box_predictor.test_nms_thresh
230
+ )
231
+
232
+ keep = keep[:max_vis_prop]
233
+ num_to_visualize = len(keep)
234
+
235
+ pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
236
+ pred_pose = instances_i.pred_pose[keep]
237
+
238
+ pred_colors = torch.tensor(
239
+ [util.get_color(i) for i in range(num_to_visualize)],
240
+ device=self.device
241
+ )/255.0
242
+
243
+ pred_boxes = instances_i.pred_boxes[keep]
244
+ pred_scores = instances_i.scores[keep]
245
+ pred_classes = instances_i.pred_classes[keep]
246
+ pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
247
+ pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
248
+
249
+ # convert to lists
250
+ pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
251
+ gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
252
+
253
+ img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
254
+ img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
255
+
256
+ # horizontal stack 3D GT and pred left/right
257
+ vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
258
+ vis_img_3d = vis_img_3d[:, :, [2, 1, 0]] # RGB
259
+ vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
260
+
261
+ storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
262
+
263
+ break # only visualize one image in a batch
264
+
265
+ @META_ARCH_REGISTRY.register()
266
+ class RCNN3D_combined_features(nn.Module):
267
+
268
+ @configurable
269
+ def __init__(self, *, backbone, proposal_generator, roi_heads, input_format, vis_period, pixel_mean, pixel_std, depth_model, only_2d):
270
+ super().__init__()
271
+ self.backbone = backbone
272
+ self.proposal_generator = proposal_generator
273
+ self.roi_heads = roi_heads
274
+ self.input_format = input_format
275
+ self.vis_period = vis_period
276
+ self.depth_model = depth_model
277
+ self.only_2d = only_2d
278
+
279
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
280
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
281
+ assert (
282
+ self.pixel_mean.shape == self.pixel_std.shape
283
+ ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
284
+
285
+ @classmethod
286
+ def from_config(cls, cfg, priors=None):
287
+ backbone = build_backbone(cfg, priors=priors)
288
+ if False: # some leftover from experimenting with incorporating depth features
289
+ depth_model = 'zoedepth'
290
+ pretrained_resource = 'local::depth/checkpoints/depth_anything_metric_depth_indoor.pt'
291
+ d_model = setup_depth_model(depth_model, pretrained_resource)  # NOTE: the depth model could also be made learnable
292
+
293
+ shape_modified = {key:ShapeSpec(i.channels*2,stride=i.stride) for key, i in backbone.output_shape().items()}
294
+ else:
295
+ d_model = None
296
+ shape_modified = backbone.output_shape()
297
+
298
+ return {
299
+ "backbone": backbone,
300
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
301
+ "roi_heads": build_roi_heads(cfg, shape_modified, priors=priors),
302
+ "input_format": cfg.INPUT.FORMAT,
303
+ "vis_period": cfg.VIS_PERIOD,
304
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
305
+ "pixel_std": cfg.MODEL.PIXEL_STD,
306
+ "depth_model": d_model,
307
+ "only_2d": cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D == 0.0,
308
+ }
309
+
310
+
311
+ @property
312
+ def device(self):
313
+ return self.pixel_mean.device
314
+
315
+ def _move_to_current_device(self, x):
316
+ return move_device_like(x, self.pixel_mean)
317
+
318
+
319
+ def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
320
+ """
321
+ Normalize, pad and batch the input images.
322
+ """
323
+ images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
324
+ if normalise:
325
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
326
+ if convert:
327
+ # convert from BGR to RGB
328
+ images = [x[[2,1,0],:,:] for x in images]
329
+ if to_float:
330
+ images = [x.float()/255.0 for x in images]
331
+ if NoOp:
332
+ images = ImageList.from_tensors(images)
333
+ return images
334
+ images = ImageList.from_tensors(
335
+ images,
336
+ self.backbone.size_divisibility,
337
+ padding_constraints=self.backbone.padding_constraints,
338
+ )
339
+ return images
340
+
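The `NoOp=True` branch above batches images with `ImageList.from_tensors` but skips the backbone's size-divisibility padding. A minimal sketch of what that padding does (dummy tensors only; this illustrates the detectron2 helper, not project-specific behaviour):

import torch
from detectron2.structures import ImageList

# two "images" of different sizes; from_tensors zero-pads both to a common HxW
imgs = [torch.rand(3, 480, 640), torch.rand(3, 360, 500)]
batched = ImageList.from_tensors(imgs)   # default size_divisibility=0, i.e. the NoOp path
print(batched.tensor.shape)              # torch.Size([2, 3, 480, 640])
print(batched.image_sizes)               # per-image (H, W) before padding: [(480, 640), (360, 500)]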
341
+ def _standardize(self, x:torch.Tensor, y:torch.Tensor):
342
+ '''standardise x to match the mean and std of y'''
343
+ ym = y.mean()
344
+ ys = y.std()
345
+ xm = x.mean()
346
+ xs = x.std()
347
+ return (x - xm) * (ys / xs) + ym
348
+
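A quick sanity check of `_standardize`: the result keeps the content of `x` but takes on the mean and standard deviation of `y`. A minimal sketch with random tensors:

import torch

x = torch.randn(1, 256, 32, 32) * 5.0 + 10.0   # arbitrary scale and offset
y = torch.randn(1, 256, 32, 32)                # supplies the target statistics

z = (x - x.mean()) * (y.std() / x.std()) + y.mean()   # same formula as _standardize
print(z.mean().item(), z.std().item())                # approximately y.mean() and y.std()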
349
+ def cat_depth_features(self, features, images_raw):
350
+ pred_o = self.depth_model(images_raw.tensor.float()/255.0)
351
+ # depth features corresponding to p2, p3, p4, p5
352
+
353
+ d_features = pred_o['depth_features']
354
+ # img_features = features['p5']
355
+ # the depth features must be resized to match each conv feature map, otherwise the scales will not correspond correctly during RoI pooling
356
+ for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)):
357
+ d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:], mode='bilinear', align_corners=True)
358
+ d_feature = self._standardize(d_feature, img_feature)
359
+ features[layer] = torch.cat((img_feature, d_feature), dim=1)
360
+ return features
361
+
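Shape-wise, `cat_depth_features` doubles the channel count of every FPN level, which is why `from_config` builds the RoI heads from a `ShapeSpec` with `channels*2` when a depth model is attached. A minimal sketch with dummy tensors; the ordering of the fake depth features is only an assumption chosen so that `reversed(...)` pairs them with p2-p5, and the `_standardize` step is omitted here:

import torch
import torch.nn.functional as F

features = {                                   # dummy FPN outputs, 256 channels each
    'p2': torch.rand(1, 256, 128, 160),
    'p3': torch.rand(1, 256, 64, 80),
    'p4': torch.rand(1, 256, 32, 40),
    'p5': torch.rand(1, 256, 16, 20),
}
d_features = [torch.rand(1, 256, 10, 12), torch.rand(1, 256, 20, 24),   # dummy depth features,
              torch.rand(1, 256, 40, 48), torch.rand(1, 256, 80, 96)]   # finest level last

for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)):
    d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:],
                              mode='bilinear', align_corners=True)
    features[layer] = torch.cat((img_feature, d_feature), dim=1)

print({k: tuple(v.shape) for k, v in features.items()})   # every level now has 512 channels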
362
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
363
+
364
+ if not self.training:
365
+ return self.inference(batched_inputs)  # the segmentor is None at inference time because the loss is not needed
366
+
367
+ images = self.preprocess_image(batched_inputs)
368
+ # NOTE: images_raw are padded to the size of the largest image in the batch.
369
+ # This is necessary because the images have different sizes; to batch them they must all share the same shape.
370
+ images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
371
+ # depth and ground maps are only loaded when the 3D branch is trained
372
+ if not self.only_2d:
373
+ depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
374
+
375
+ ground_maps_fail = [i['ground_map'] is None for i in batched_inputs]
376
+ ground_maps_fail_idx = [i for i, x in enumerate(ground_maps_fail) if x]
377
+ for idx in ground_maps_fail_idx:
378
+ batched_inputs[idx]['ground_map'] = torch.tensor([[1]])  # 1x1 dummy marks a missing ground map
379
+ ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
380
+ else:
381
+ ground_maps = None
382
+ depth_maps = None
383
+
384
+ # scaling factor for the sample relative to its original scale
385
+ # e.g., how much has the image been upsampled by? or downsampled?
386
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
387
+
388
+ # The unmodified intrinsics for the image
389
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
390
+
391
+ if "instances" in batched_inputs[0]:
392
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
393
+
394
+ features = self.backbone(images.tensor)
395
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
396
+
397
+ if self.depth_model is not None:
398
+ features = self.cat_depth_features(features, images_raw)
399
+
400
+ instances, detector_losses = self.roi_heads(
401
+ images, images_raw, ground_maps, depth_maps, features, proposals,
402
+ Ks, im_scales_ratio,
403
+ gt_instances
404
+ )
405
+
406
+ if self.vis_period > 0:
407
+ storage = get_event_storage()
408
+ if storage.iter % self.vis_period == 0 and storage.iter > 0:
409
+ self.visualize_training(batched_inputs, proposals, instances)
410
+
411
+ losses = {}
412
+ losses.update(detector_losses)
413
+ losses.update(proposal_losses)
414
+ return losses
415
+
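For orientation, a hedged sketch of the per-image dictionary this `forward` expects during training. The field names come from the accesses above (`image`, `depth_map`, `ground_map`, `instances`, `K`, `height`); the shapes, dtypes and the `width` field are assumptions:

import torch
from detectron2.structures import Instances

h, w = 480, 640
sample = {
    "image": torch.randint(0, 255, (3, h, w), dtype=torch.uint8),      # channel order per cfg.INPUT.FORMAT
    "depth_map": torch.rand(1, h, w),                                   # assumed dense depth map
    "ground_map": torch.rand(1, h, w),                                  # may be None; a 1x1 dummy is substituted
    "instances": Instances((h, w)),                                     # gt_boxes, gt_classes, gt_boxes3D, gt_poses, ...
    "K": [[500.0, 0.0, w / 2], [0.0, 500.0, h / 2], [0.0, 0.0, 1.0]],   # original camera intrinsics
    "height": h,                                                        # original height, used for the scale ratio
    "width": w,                                                         # assumed detectron2 convention
}
batched_inputs = [sample]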
416
+ def inference(
417
+ self,
418
+ batched_inputs: List[Dict[str, torch.Tensor]],
419
+ detected_instances: Optional[List[Instances]] = None,
420
+ do_postprocess: bool = True,
421
+ ):
422
+ assert not self.training
423
+
424
+ images = self.preprocess_image(batched_inputs)
425
+ images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
426
+ # ground maps and depth maps are assumed to be unavailable at inference time
427
+ ground_maps = None
428
+ depth_maps = None
429
+
430
+ # scaling factor for the sample relative to its original scale
431
+ # e.g., how much has the image been upsampled by? or downsampled?
432
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
433
+
434
+ # The unmodified intrinsics for the image
435
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
436
+
437
+ features = self.backbone(images.tensor)
438
+
439
+ # Pass oracle 2D boxes into the RoI heads
440
+ if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
441
+ oracles = [b['oracle2D'] for b in batched_inputs]
442
+ results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, oracles, Ks, im_scales_ratio, None)
443
+
444
+ # normal inference
445
+ else:
446
+ proposals, _ = self.proposal_generator(images, features, None)
447
+ if self.depth_model is not None:
448
+ features = self.cat_depth_features(features, images_raw)
449
+ # pred boxes are proposals
450
+ results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, None)
451
+
452
+ if do_postprocess:
453
+ assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
454
+ return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
455
+ else:
456
+ return results
457
+
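A minimal sketch of calling this model at inference time, assuming `model` was produced by `build_model` further below and trained weights were loaded elsewhere; only `image`, `K` and `height` are read by the code above, and an optional `oracle2D` entry bypasses the proposal generator:

import torch

image = torch.randint(0, 255, (3, 480, 640), dtype=torch.uint8)   # dummy image in cfg.INPUT.FORMAT order
K = [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]]   # dummy intrinsics

model.eval()
with torch.no_grad():
    results = model([{"image": image, "K": K, "height": 480, "width": 640}])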
458
+ def visualize_training(self, batched_inputs, proposals, instances):
459
+ """
460
+ A function used to visualize images and proposals. It shows ground truth
461
+ bounding boxes on the original image and up to 20 top-scoring predicted
462
+ object proposals on the original image. Users can implement different
463
+ visualization functions for different models.
464
+ Args:
465
+ batched_inputs (list): a list that contains input to the model.
466
+ proposals (list): a list that contains predicted proposals. Both
467
+ batched_inputs and proposals should have the same length.
468
+ instances (list): a list that contains predicted RoI head instances. Both
469
+ batched_inputs and instances should have the same length.
470
+ """
471
+
472
+ storage = get_event_storage()
473
+
474
+ # maximum number of boxes to visualize per image
475
+ max_vis_prop = 20
476
+
477
+ if not hasattr(self, 'thing_classes'):
478
+ self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
479
+ self.num_classes = len(self.thing_classes)
480
+ only2d = instances is None
481
+ if only2d:
482
+ instances = [None]*len(batched_inputs)
483
+ for input, prop, instances_i in zip(batched_inputs, proposals, instances):
484
+
485
+ img = input["image"]
486
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
487
+ img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
488
+ img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
489
+
490
+ '''
491
+ Visualize the 2D GT and proposal predictions
492
+ '''
493
+ v_gt = Visualizer(img, None)
494
+ v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
495
+ anno_img = v_gt.get_image()
496
+ box_size = min(len(prop.proposal_boxes), max_vis_prop)
497
+ v_pred = Visualizer(img, None)
498
+ v_pred = v_pred.overlay_instances(
499
+ boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
500
+ )
501
+ prop_img = v_pred.get_image()
502
+ vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
503
+ vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
504
+ storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
505
+ if only2d:
506
+ break
507
+ '''
508
+ Visualize the 3D GT and predictions
509
+ '''
510
+ K = torch.tensor(input['K'], device=self.device)
511
+ scale = input['height']/img.shape[0]
512
+ fx, sx = (val.item()/scale for val in K[0, [0, 2]])
513
+ fy, sy = (val.item()/scale for val in K[1, [1, 2]])
514
+
515
+ K_scaled = torch.tensor(
516
+ [[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
517
+ dtype=torch.float32, device=self.device
518
+ ) @ K
519
+
520
+ gts_per_image = input["instances"]
521
+
522
+ gt_classes = gts_per_image.gt_classes
523
+
524
+ # Filter out irrelevant groundtruth
525
+ fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
526
+
527
+ gt_classes = gt_classes[fg_selection_mask]
528
+ gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
529
+ gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
530
+ gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
531
+
532
+ # projected 2D center, depth, w, h, l, 3D center
533
+ gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
534
+
535
+ # this box may have been mirrored and scaled so
536
+ # we need to recompute XYZ in 3D by backprojecting.
537
+ gt_z = gt_boxes3D[:, 2]
538
+
539
+ gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
540
+ gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
541
+
542
+ # put together the GT boxes
543
+ gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
544
+ gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
545
+
546
+ gt_colors = torch.tensor(
547
+ [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
548
+ device=self.device
549
+ )/255.0
550
+
551
+ gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
552
+
553
+ # perform a simple NMS, which is not cls dependent.
554
+ keep = batched_nms(
555
+ instances_i.pred_boxes.tensor,
556
+ instances_i.scores,
557
+ torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
558
+ self.roi_heads.box_predictor.test_nms_thresh
559
+ )
560
+
561
+ keep = keep[:max_vis_prop]
562
+ num_to_visualize = len(keep)
563
+
564
+ pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
565
+ pred_pose = instances_i.pred_pose[keep]
566
+
567
+ pred_colors = torch.tensor(
568
+ [util.get_color(i) for i in range(num_to_visualize)],
569
+ device=self.device
570
+ )/255.0
571
+
572
+ pred_boxes = instances_i.pred_boxes[keep]
573
+ pred_scores = instances_i.scores[keep]
574
+ pred_classes = instances_i.pred_classes[keep]
575
+ pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
576
+ pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
577
+
578
+ # convert to lists
579
+ pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
580
+ gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
581
+
582
+ img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
583
+ img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
584
+
585
+ # horizontal stack 3D GT and pred left/right
586
+ vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
587
+ vis_img_3d = vis_img_3d[:, :, [2, 1, 0]] # RGB
588
+ vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
589
+
590
+ storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
591
+
592
+ break # only visualize one image in a batch
593
+
594
+ def build_model(cfg, priors=None):
595
+ """
596
+ Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
597
+ Note that it does not load any weights from ``cfg``.
598
+ """
599
+ meta_arch = cfg.MODEL.META_ARCHITECTURE
600
+ model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors)
601
+ model.to(torch.device(cfg.MODEL.DEVICE))
602
+ _log_api_usage("modeling.meta_arch." + meta_arch)
603
+ return model
604
+
605
+ def build_backbone(cfg, input_shape=None, priors=None):
606
+ """
607
+ Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
608
+
609
+ Returns:
610
+ an instance of :class:`Backbone`
611
+ """
612
+ if input_shape is None:
613
+ input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
614
+
615
+ backbone_name = cfg.MODEL.BACKBONE.NAME
616
+ backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors)
617
+ assert isinstance(backbone, Backbone)
618
+ return backbone
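A hedged usage sketch for the two builders above; `cfg` is assumed to be a detectron2-style CfgNode created and merged from one of the YAML files under configs/ by the project's own config utilities (not shown here):

model = build_model(cfg, priors=None)   # instantiates cfg.MODEL.META_ARCHITECTURE and moves it to cfg.MODEL.DEVICE
backbone = build_backbone(cfg)          # standalone backbone; input channels default to len(cfg.MODEL.PIXEL_MEAN)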