AndreasLH commited on
Commit
db3da1e
·
1 Parent(s): dc15a2b
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +28 -0
  2. .gitmodules +9 -0
  3. .vscode/launch.json +100 -0
  4. DATA.md +219 -0
  5. Dockerfile +32 -0
  6. LICENSE.md +906 -0
  7. MODEL_ZOO.md +17 -0
  8. ProposalNetwork/utils/__init__.py +3 -0
  9. ProposalNetwork/utils/conversions.py +50 -0
  10. ProposalNetwork/utils/plane.py +209 -0
  11. ProposalNetwork/utils/spaces.py +328 -0
  12. ProposalNetwork/utils/utils.py +564 -0
  13. README.md +4 -5
  14. VisualiseGT.py +830 -0
  15. app.py +155 -0
  16. configs/Base.yaml +89 -0
  17. configs/Base_Omni3D.yaml +18 -0
  18. configs/Base_Omni3D_2D_only.yaml +20 -0
  19. configs/Base_Omni3D_in.yaml +18 -0
  20. configs/Base_Omni3D_og.yaml +18 -0
  21. configs/Base_Omni3D_out.yaml +18 -0
  22. configs/Base_Omni3D_prof.yaml +18 -0
  23. configs/Omni_combined.yaml +37 -0
  24. configs/category_meta.json +1 -0
  25. configs/cubercnn_DLA34_FPN.yaml +6 -0
  26. configs/cubercnn_ResNet34_FPN.yaml +7 -0
  27. configs/cubercnn_densenet_FPN.yaml +4 -0
  28. configs/cubercnn_mnasnet_FPN.yaml +4 -0
  29. configs/cubercnn_shufflenet_FPN.yaml +4 -0
  30. cubercnn/config/__init__.py +1 -0
  31. cubercnn/config/config.py +187 -0
  32. cubercnn/data/Omni_to_kitti.py +197 -0
  33. cubercnn/data/__init__.py +5 -0
  34. cubercnn/data/build.py +260 -0
  35. cubercnn/data/builtin.py +46 -0
  36. cubercnn/data/dataset_mapper.py +272 -0
  37. cubercnn/data/datasets.py +480 -0
  38. cubercnn/data/filter_ground.py +26 -0
  39. cubercnn/data/generate_depth_maps.py +86 -0
  40. cubercnn/data/generate_ground_segmentations.py +206 -0
  41. cubercnn/evaluation/__init__.py +1 -0
  42. cubercnn/evaluation/omni3d_evaluation.py +1706 -0
  43. cubercnn/modeling/backbone/__init__.py +5 -0
  44. cubercnn/modeling/backbone/densenet.py +64 -0
  45. cubercnn/modeling/backbone/dla.py +507 -0
  46. cubercnn/modeling/backbone/mnasnet.py +63 -0
  47. cubercnn/modeling/backbone/resnet.py +96 -0
  48. cubercnn/modeling/backbone/shufflenet.py +69 -0
  49. cubercnn/modeling/meta_arch/__init__.py +1 -0
  50. cubercnn/modeling/meta_arch/rcnn3d.py +618 -0
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # folders or files
2
+ detectron2/
3
+ pytorch3d/
4
+ datasets/*
5
+ testing/image_2
6
+ training/image_2
7
+ # .vscode/
8
+ .ipynb_checkpoints/
9
+ .idea/
10
+ output/
11
+ cubercnn/external/
12
+ wandb/
13
+ hpc_logs/
14
+ depth/checkpoints/
15
+ ProposalNetwork/proposals/network_out.pkl
16
+ .vscode/settings.json
17
+ submit.sh
18
+ profiling/
19
+
20
+ # filetypes
21
+ *.pyc
22
+ *.mexa64
23
+ */output/*
24
+ */output*/*
25
+ *~
26
+ *.so
27
+ #*.ipynb
28
+ ProposalNetwork/proposals/figs/*
.gitmodules ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [submodule "GroundingDINO"]
2
+ path = GroundingDINO
3
+ url = https://github.com/AndreasLH/GroundingDINO
4
+ [submodule "sam-hq"]
5
+ path = sam-hq
6
+ url = https://github.com/SysCV/sam-hq.git
7
+ [submodule "Depth-Anything-V2"]
8
+ path = Depth-Anything-V2
9
+ url = https://github.com/DepthAnything/Depth-Anything-V2
.vscode/launch.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+
8
+ {
9
+ "name": "Python: Current File",
10
+ "type": "python" ,
11
+ "request": "launch",
12
+ "program": "${file}",
13
+ "console": "integratedTerminal",
14
+ "justMyCode": true,
15
+ "args": []
16
+ },
17
+ {
18
+ "name": "Cube R-CNN Demo",
19
+ "type": "python",
20
+ "request": "launch",
21
+ "program": "demo/demo.py",
22
+ "console": "integratedTerminal",
23
+ "justMyCode": true,
24
+ "args": ["--config-file", "cubercnn://omni3d/cubercnn_DLA34_FPN.yaml", "--input-folder", "datasets/title", "--threshold", "0.25", "MODEL.WEIGHTS", "cubercnn://omni3d/cubercnn_DLA34_FPN.pth", "OUTPUT_DIR", "output/demo"]
25
+ },
26
+ {
27
+ "name": "Cube R-CNN 2D only",
28
+ "type": "python",
29
+ "request": "launch",
30
+ "program": "tools/train_net.py",
31
+ "console": "integratedTerminal",
32
+ "justMyCode": true,
33
+ "args": ["--config-file", "configs/Base_Omni3D_2D_only.yaml", "MODEL.WEIGHTS", "output/omni3d-2d-only/model_recent.pth", "OUTPUT_DIR", "output/omni3d-2d-only", "log", "False"]
34
+ },
35
+ {
36
+ "name": "Cube R-CNN Time equalised Demo",
37
+ "type": "python",
38
+ "request": "launch",
39
+ "program": "demo/demo.py",
40
+ "console": "integratedTerminal",
41
+ "justMyCode": true,
42
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "--input-folder", "datasets/coco_examples", "--threshold", "0.25", "MODEL.WEIGHTS", "output/omni_equalised/model_final.pth", "OUTPUT_DIR", "output/demo_time_equal"]
43
+ },
44
+ {
45
+ "name": "Cube R-CNN pseudo gt demo",
46
+ "type": "python",
47
+ "request": "launch",
48
+ "program": "demo/demo.py",
49
+ "console": "integratedTerminal",
50
+ "justMyCode": true,
51
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "--input-folder", "datasets/title", "--threshold", "0.25", "MODEL.WEIGHTS", "output/omni_pseudo_gt/model_final.pth", "OUTPUT_DIR", "output/demo_pseudogt"]
52
+ },
53
+ {
54
+ "name": "train",
55
+ "type": "python",
56
+ "request": "launch",
57
+ "program": "tools/train_net.py",
58
+ "console": "integratedTerminal",
59
+ "justMyCode": true,
60
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "OUTPUT_DIR", "output/omni3d_example_run"]
61
+ },
62
+ {
63
+ "name": "resume train",
64
+ "type": "python",
65
+ "request": "launch",
66
+ "program": "tools/train_net.py",
67
+ "console": "integratedTerminal",
68
+ "justMyCode": true,
69
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "--resume", "OUTPUT_DIR", "output/Baseline_sgd"]
70
+ },
71
+ {
72
+ "name": "eval, train_net pretrained",
73
+ "type": "python",
74
+ "request": "launch",
75
+ "program": "tools/train_net.py",
76
+ "console": "integratedTerminal",
77
+ "justMyCode": true,
78
+ "args": ["--eval-only", "--config-file", "cubercnn://omni3d/cubercnn_DLA34_FPN.yaml", "MODEL.WEIGHTS", "cubercnn://omni3d/cubercnn_DLA34_FPN.pth"]
79
+ },
80
+ {
81
+ "name": "eval, train_net locally trained",
82
+ "type": "python",
83
+ "request": "launch",
84
+ "program": "tools/train_net.py",
85
+ "console": "integratedTerminal",
86
+ "justMyCode": true,
87
+ "args": ["--eval-only", "--config-file", "configs/Base_Omni3D.yaml", "MODEL.WEIGHTS", "output/Baseline_sgd/model_final.pth"]
88
+ },
89
+ {
90
+ "name": "train Cube R-CNN weak loss",
91
+ "type": "python",
92
+ "request": "launch",
93
+ "program": "tools/train_net.py",
94
+ "console": "integratedTerminal",
95
+ "justMyCode": true,
96
+ "args": ["--config-file", "configs/Omni_combined.yaml", "OUTPUT_DIR", "output/omni3d_combined_test", "log", "False", "loss_functions", "['iou', 'z_pseudo_gt_center', 'pose_alignment', 'pose_ground']"]
97
+ },
98
+
99
+ ]
100
+ }
DATA.md ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - [Data Preparation](#data-preparation)
2
+ - [Download Omni3D json](#download-omni3d-json)
3
+ - [Download Individual Datasets](#download-individual-datasets)
4
+ - [Data Usage](#data-usage)
5
+ - [Coordinate System](#coordinate-system)
6
+ - [Annotation Format](#annotation-format)
7
+ - [Example Loading Data](#example-loading-data)
8
+
9
+ # Data Preparation
10
+
11
+ The Omni3D dataset is comprised of 6 datasets which have been pre-processed into the same annotation format and camera coordinate systems. To use a subset or the full dataset you must download:
12
+
13
+ 1. The processed Omni3D json files
14
+ 2. RGB images from each dataset separately
15
+
16
+ ## Download Omni3D json
17
+
18
+ Run
19
+
20
+ ```
21
+ sh datasets/Omni3D/download_omni3d_json.sh
22
+ ```
23
+
24
+ to download and extract the Omni3D train, val and test json annotation files.
25
+
26
+ ## Download Individual Datasets
27
+
28
+ Below are the instructions for setting up each individual dataset. It is recommended to download only the data you plan to use.
29
+
30
+ ### KITTI
31
+ Download the left color images from [KITTI's official website](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below. Note that we only require the image_2 folder.
32
+
33
+ ```bash
34
+ datasets/KITTI_object
35
+ └── training
36
+ ├── image_2
37
+ ```
38
+
39
+
40
+ ### nuScenes
41
+
42
+ Download the trainval images from the [official nuScenes website](https://www.nuscenes.org/nuscenes#download). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below. Note that we only require the CAM_FRONT folder.
43
+
44
+ ```bash
45
+ datasets/nuScenes/samples
46
+ └── samples
47
+ ├── CAM_FRONT
48
+ ```
49
+
50
+ ### Objectron
51
+
52
+ Run
53
+
54
+ ```
55
+ sh datasets/objectron/download_objectron_images.sh
56
+ ```
57
+
58
+ to download and extract the Objectron pre-processed images (~24 GB).
59
+
60
+ ### SUN RGB-D
61
+
62
+ Download the "SUNRGBD V1" images at [SUN RGB-D's official website](https://rgbd.cs.princeton.edu/). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below.
63
+
64
+ ```bash
65
+ ./Omni3D/datasets/SUNRGBD
66
+ ├── kv1
67
+ ├── kv2
68
+ ├── realsense
69
+ ```
70
+
71
+ ### ARKitScenes
72
+
73
+ Run
74
+
75
+ ```
76
+ sh datasets/ARKitScenes/download_arkitscenes_images.sh
77
+ ```
78
+
79
+ to download and extract the ARKitScenes pre-processed images (~28 GB).
80
+
81
+ ### Hypersim
82
+
83
+ Follow the [download instructions](https://github.com/apple/ml-hypersim/tree/main/contrib/99991) from [Thomas Germer](https://github.com/99991) in order to download all \*tonemap.jpg preview images in order to avoid downloading the full Hypersim dataset. For example:
84
+
85
+ ```bash
86
+ git clone https://github.com/apple/ml-hypersim
87
+ cd ml-hypersim/
88
+ python contrib/99991/download.py -c .tonemap.jpg -d /path/to/Omni3D/datasets/hypersim --silent
89
+ ```
90
+
91
+ Then arrange or unzip the downloaded images into the root `./Omni3D/` so that it has the below folder structure.
92
+
93
+ ```bash
94
+ datasets/hypersim/
95
+ ├── ai_001_001
96
+ ├── ai_001_002
97
+ ├── ai_001_003
98
+ ├── ai_001_004
99
+ ├── ai_001_005
100
+ ├── ai_001_006
101
+ ...
102
+ ```
103
+
104
+ # Data Usage
105
+
106
+ Below we describe the unified 3D annotation coordinate systems, annotation format, and an example script.
107
+
108
+
109
+ ## Coordinate System
110
+
111
+ All 3D annotations are provided in a shared camera coordinate system with
112
+ +x right, +y down, +z toward screen.
113
+
114
+ The vertex order of bbox3D_cam:
115
+ ```
116
+ v4_____________________v5
117
+ /| /|
118
+ / | / |
119
+ / | / |
120
+ /___|_________________/ |
121
+ v0| | |v1 |
122
+ | | | |
123
+ | | | |
124
+ | | | |
125
+ | |_________________|___|
126
+ | / v7 | /v6
127
+ | / | /
128
+ | / | /
129
+ |/_____________________|/
130
+ v3 v2
131
+ ```
132
+
133
+ ## Annotation Format
134
+ Each dataset is formatted as a dict in python in the below format.
135
+
136
+ ```python
137
+ dataset {
138
+ "info" : info,
139
+ "images" : [image],
140
+ "categories" : [category],
141
+ "annotations" : [object],
142
+ }
143
+
144
+ info {
145
+ "id" : str,
146
+ "source" : int,
147
+ "name" : str,
148
+ "split" : str,
149
+ "version" : str,
150
+ "url" : str,
151
+ }
152
+
153
+ image {
154
+ "id" : int,
155
+ "dataset_id" : int,
156
+ "width" : int,
157
+ "height" : int,
158
+ "file_path" : str,
159
+ "K" : list (3x3),
160
+ "src_90_rotate" : int, # im was rotated X times, 90 deg counterclockwise
161
+ "src_flagged" : bool, # flagged as potentially inconsistent sky direction
162
+ }
163
+
164
+ category {
165
+ "id" : int,
166
+ "name" : str,
167
+ "supercategory" : str
168
+ }
169
+
170
+ object {
171
+
172
+ "id" : int, # unique annotation identifier
173
+ "image_id" : int, # identifier for image
174
+ "category_id" : int, # identifier for the category
175
+ "category_name" : str, # plain name for the category
176
+
177
+ # General 2D/3D Box Parameters.
178
+ # Values are set to -1 when unavailable.
179
+ "valid3D" : bool, # flag for no reliable 3D box
180
+ "bbox2D_tight" : [x1, y1, x2, y2], # 2D corners of annotated tight box
181
+ "bbox2D_proj" : [x1, y1, x2, y2], # 2D corners projected from bbox3D
182
+ "bbox2D_trunc" : [x1, y1, x2, y2], # 2D corners projected from bbox3D then truncated
183
+ "bbox3D_cam" : [[x1, y1, z1]...[x8, y8, z8]] # 3D corners in meters and camera coordinates
184
+ "center_cam" : [x, y, z], # 3D center in meters and camera coordinates
185
+ "dimensions" : [width, height, length], # 3D attributes for object dimensions in meters
186
+ "R_cam" : list (3x3), # 3D rotation matrix to the camera frame rotation
187
+
188
+ # Optional dataset specific properties,
189
+ # used mainly for evaluation and ignore.
190
+ # Values are set to -1 when unavailable.
191
+ "behind_camera" : bool, # a corner is behind camera
192
+ "visibility" : float, # annotated visibility 0 to 1
193
+ "truncation" : float, # computed truncation 0 to 1
194
+ "segmentation_pts" : int, # visible instance segmentation points
195
+ "lidar_pts" : int, # visible LiDAR points in the object
196
+ "depth_error" : float, # L1 of depth map and rendered object
197
+ }
198
+ ```
199
+
200
+
201
+ ## Example Loading Data
202
+ Each dataset is named as "Omni3D_{name}_{split}.json" where split can be train, val, or test.
203
+
204
+ The annotations are in a COCO-like format such that if you load the json from the Omni3D class which inherits the [COCO class](https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L70), you can use basic COCO dataset functions as demonstrated with the below code.
205
+
206
+ ```python
207
+ from cubercnn import data
208
+
209
+ dataset_paths_to_json = ['path/to/Omni3D/{name}_{split}.json', ...]
210
+
211
+ # Example 1. load all images
212
+ dataset = data.Omni3D(dataset_paths_to_json)
213
+ imgIds = dataset.getImgIds()
214
+ imgs = dataset.loadImgs(imgIds)
215
+
216
+ # Example 2. load annotations for image index 0
217
+ annIds = dataset.getAnnIds(imgIds=imgs[0]['id'])
218
+ anns = dataset.loadAnns(annIds)
219
+ ```
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base image, have to use the full version to use the git features
2
+ FROM python:3.12
3
+ # https://huggingface.co/docs/hub/spaces-sdks-docker-first-demo
4
+
5
+ # RUN apt-get install -y git
6
+
7
+ WORKDIR /code
8
+ COPY ./requirements.txt /code/requirements.txt
9
+ COPY ./pre-requirements.txt /code/pre-requirements.txt
10
+ COPY ./GroundingDINO /code/GroundingDINO
11
+ COPY ./sam-hq /code/sam-hq
12
+
13
+ RUN pip install --no-cache-dir -r /code/pre-requirements.txt
14
+ RUN pip install --no-cache-dir -r /code/requirements.txt
15
+
16
+ # Set up a new user named "user" with user ID 1000
17
+ RUN useradd -m -u 1000 user
18
+
19
+ # Switch to the "user" user
20
+ USER user
21
+
22
+ # Set home to the user's home directory
23
+ ENV HOME=/home/user \
24
+ PATH=/home/user/.local/bin:$PATH
25
+
26
+ # Set the working directory to the user's home directory
27
+ WORKDIR $HOME/app
28
+
29
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
30
+ COPY --chown=user . $HOME/app
31
+
32
+ CMD ["python", "app.py"]
LICENSE.md ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - [Omni3D and Cube R-CNN License](#omni3d-and-cube-r-cnn-license)
2
+ - [ARKitScenes License](#arkitscenes-license)
3
+ - [Objectron License](#objectron-license)
4
+
5
+ # Omni3D and Cube R-CNN License
6
+ https://github.com/facebookresearch/omni3d
7
+ https://github.com/facebookresearch/omni3d/blob/main/LICENSE.md
8
+
9
+ Attribution-NonCommercial 4.0 International
10
+
11
+ =======================================================================
12
+
13
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
14
+ does not provide legal services or legal advice. Distribution of
15
+ Creative Commons public licenses does not create a lawyer-client or
16
+ other relationship. Creative Commons makes its licenses and related
17
+ information available on an "as-is" basis. Creative Commons gives no
18
+ warranties regarding its licenses, any material licensed under their
19
+ terms and conditions, or any related information. Creative Commons
20
+ disclaims all liability for damages resulting from their use to the
21
+ fullest extent possible.
22
+
23
+ Using Creative Commons Public Licenses
24
+
25
+ Creative Commons public licenses provide a standard set of terms and
26
+ conditions that creators and other rights holders may use to share
27
+ original works of authorship and other material subject to copyright
28
+ and certain other rights specified in the public license below. The
29
+ following considerations are for informational purposes only, are not
30
+ exhaustive, and do not form part of our licenses.
31
+
32
+ Considerations for licensors: Our public licenses are
33
+ intended for use by those authorized to give the public
34
+ permission to use material in ways otherwise restricted by
35
+ copyright and certain other rights. Our licenses are
36
+ irrevocable. Licensors should read and understand the terms
37
+ and conditions of the license they choose before applying it.
38
+ Licensors should also secure all rights necessary before
39
+ applying our licenses so that the public can reuse the
40
+ material as expected. Licensors should clearly mark any
41
+ material not subject to the license. This includes other CC-
42
+ licensed material, or material used under an exception or
43
+ limitation to copyright. More considerations for licensors:
44
+ wiki.creativecommons.org/Considerations_for_licensors
45
+
46
+ Considerations for the public: By using one of our public
47
+ licenses, a licensor grants the public permission to use the
48
+ licensed material under specified terms and conditions. If
49
+ the licensor's permission is not necessary for any reason--for
50
+ example, because of any applicable exception or limitation to
51
+ copyright--then that use is not regulated by the license. Our
52
+ licenses grant only permissions under copyright and certain
53
+ other rights that a licensor has authority to grant. Use of
54
+ the licensed material may still be restricted for other
55
+ reasons, including because others have copyright or other
56
+ rights in the material. A licensor may make special requests,
57
+ such as asking that all changes be marked or described.
58
+ Although not required by our licenses, you are encouraged to
59
+ respect those requests where reasonable. More_considerations
60
+ for the public:
61
+ wiki.creativecommons.org/Considerations_for_licensees
62
+
63
+ =======================================================================
64
+
65
+ Creative Commons Attribution-NonCommercial 4.0 International Public
66
+ License
67
+
68
+ By exercising the Licensed Rights (defined below), You accept and agree
69
+ to be bound by the terms and conditions of this Creative Commons
70
+ Attribution-NonCommercial 4.0 International Public License ("Public
71
+ License"). To the extent this Public License may be interpreted as a
72
+ contract, You are granted the Licensed Rights in consideration of Your
73
+ acceptance of these terms and conditions, and the Licensor grants You
74
+ such rights in consideration of benefits the Licensor receives from
75
+ making the Licensed Material available under these terms and
76
+ conditions.
77
+
78
+ Section 1 -- Definitions.
79
+
80
+ a. Adapted Material means material subject to Copyright and Similar
81
+ Rights that is derived from or based upon the Licensed Material
82
+ and in which the Licensed Material is translated, altered,
83
+ arranged, transformed, or otherwise modified in a manner requiring
84
+ permission under the Copyright and Similar Rights held by the
85
+ Licensor. For purposes of this Public License, where the Licensed
86
+ Material is a musical work, performance, or sound recording,
87
+ Adapted Material is always produced where the Licensed Material is
88
+ synched in timed relation with a moving image.
89
+
90
+ b. Adapter's License means the license You apply to Your Copyright
91
+ and Similar Rights in Your contributions to Adapted Material in
92
+ accordance with the terms and conditions of this Public License.
93
+
94
+ c. Copyright and Similar Rights means copyright and/or similar rights
95
+ closely related to copyright including, without limitation,
96
+ performance, broadcast, sound recording, and Sui Generis Database
97
+ Rights, without regard to how the rights are labeled or
98
+ categorized. For purposes of this Public License, the rights
99
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
100
+ Rights.
101
+ d. Effective Technological Measures means those measures that, in the
102
+ absence of proper authority, may not be circumvented under laws
103
+ fulfilling obligations under Article 11 of the WIPO Copyright
104
+ Treaty adopted on December 20, 1996, and/or similar international
105
+ agreements.
106
+
107
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
108
+ any other exception or limitation to Copyright and Similar Rights
109
+ that applies to Your use of the Licensed Material.
110
+
111
+ f. Licensed Material means the artistic or literary work, database,
112
+ or other material to which the Licensor applied this Public
113
+ License.
114
+
115
+ g. Licensed Rights means the rights granted to You subject to the
116
+ terms and conditions of this Public License, which are limited to
117
+ all Copyright and Similar Rights that apply to Your use of the
118
+ Licensed Material and that the Licensor has authority to license.
119
+
120
+ h. Licensor means the individual(s) or entity(ies) granting rights
121
+ under this Public License.
122
+
123
+ i. NonCommercial means not primarily intended for or directed towards
124
+ commercial advantage or monetary compensation. For purposes of
125
+ this Public License, the exchange of the Licensed Material for
126
+ other material subject to Copyright and Similar Rights by digital
127
+ file-sharing or similar means is NonCommercial provided there is
128
+ no payment of monetary compensation in connection with the
129
+ exchange.
130
+
131
+ j. Share means to provide material to the public by any means or
132
+ process that requires permission under the Licensed Rights, such
133
+ as reproduction, public display, public performance, distribution,
134
+ dissemination, communication, or importation, and to make material
135
+ available to the public including in ways that members of the
136
+ public may access the material from a place and at a time
137
+ individually chosen by them.
138
+
139
+ k. Sui Generis Database Rights means rights other than copyright
140
+ resulting from Directive 96/9/EC of the European Parliament and of
141
+ the Council of 11 March 1996 on the legal protection of databases,
142
+ as amended and/or succeeded, as well as other essentially
143
+ equivalent rights anywhere in the world.
144
+
145
+ l. You means the individual or entity exercising the Licensed Rights
146
+ under this Public License. Your has a corresponding meaning.
147
+
148
+ Section 2 -- Scope.
149
+
150
+ a. License grant.
151
+
152
+ 1. Subject to the terms and conditions of this Public License,
153
+ the Licensor hereby grants You a worldwide, royalty-free,
154
+ non-sublicensable, non-exclusive, irrevocable license to
155
+ exercise the Licensed Rights in the Licensed Material to:
156
+
157
+ a. reproduce and Share the Licensed Material, in whole or
158
+ in part, for NonCommercial purposes only; and
159
+
160
+ b. produce, reproduce, and Share Adapted Material for
161
+ NonCommercial purposes only.
162
+
163
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
164
+ Exceptions and Limitations apply to Your use, this Public
165
+ License does not apply, and You do not need to comply with
166
+ its terms and conditions.
167
+
168
+ 3. Term. The term of this Public License is specified in Section
169
+ 6(a).
170
+
171
+ 4. Media and formats; technical modifications allowed. The
172
+ Licensor authorizes You to exercise the Licensed Rights in
173
+ all media and formats whether now known or hereafter created,
174
+ and to make technical modifications necessary to do so. The
175
+ Licensor waives and/or agrees not to assert any right or
176
+ authority to forbid You from making technical modifications
177
+ necessary to exercise the Licensed Rights, including
178
+ technical modifications necessary to circumvent Effective
179
+ Technological Measures. For purposes of this Public License,
180
+ simply making modifications authorized by this Section 2(a)
181
+ (4) never produces Adapted Material.
182
+
183
+ 5. Downstream recipients.
184
+
185
+ a. Offer from the Licensor -- Licensed Material. Every
186
+ recipient of the Licensed Material automatically
187
+ receives an offer from the Licensor to exercise the
188
+ Licensed Rights under the terms and conditions of this
189
+ Public License.
190
+
191
+ b. No downstream restrictions. You may not offer or impose
192
+ any additional or different terms or conditions on, or
193
+ apply any Effective Technological Measures to, the
194
+ Licensed Material if doing so restricts exercise of the
195
+ Licensed Rights by any recipient of the Licensed
196
+ Material.
197
+
198
+ 6. No endorsement. Nothing in this Public License constitutes or
199
+ may be construed as permission to assert or imply that You
200
+ are, or that Your use of the Licensed Material is, connected
201
+ with, or sponsored, endorsed, or granted official status by,
202
+ the Licensor or others designated to receive attribution as
203
+ provided in Section 3(a)(1)(A)(i).
204
+
205
+ b. Other rights.
206
+
207
+ 1. Moral rights, such as the right of integrity, are not
208
+ licensed under this Public License, nor are publicity,
209
+ privacy, and/or other similar personality rights; however, to
210
+ the extent possible, the Licensor waives and/or agrees not to
211
+ assert any such rights held by the Licensor to the limited
212
+ extent necessary to allow You to exercise the Licensed
213
+ Rights, but not otherwise.
214
+
215
+ 2. Patent and trademark rights are not licensed under this
216
+ Public License.
217
+
218
+ 3. To the extent possible, the Licensor waives any right to
219
+ collect royalties from You for the exercise of the Licensed
220
+ Rights, whether directly or through a collecting society
221
+ under any voluntary or waivable statutory or compulsory
222
+ licensing scheme. In all other cases the Licensor expressly
223
+ reserves any right to collect such royalties, including when
224
+ the Licensed Material is used other than for NonCommercial
225
+ purposes.
226
+
227
+ Section 3 -- License Conditions.
228
+
229
+ Your exercise of the Licensed Rights is expressly made subject to the
230
+ following conditions.
231
+
232
+ a. Attribution.
233
+
234
+ 1. If You Share the Licensed Material (including in modified
235
+ form), You must:
236
+
237
+ a. retain the following if it is supplied by the Licensor
238
+ with the Licensed Material:
239
+
240
+ i. identification of the creator(s) of the Licensed
241
+ Material and any others designated to receive
242
+ attribution, in any reasonable manner requested by
243
+ the Licensor (including by pseudonym if
244
+ designated);
245
+
246
+ ii. a copyright notice;
247
+
248
+ iii. a notice that refers to this Public License;
249
+
250
+ iv. a notice that refers to the disclaimer of
251
+ warranties;
252
+
253
+ v. a URI or hyperlink to the Licensed Material to the
254
+ extent reasonably practicable;
255
+
256
+ b. indicate if You modified the Licensed Material and
257
+ retain an indication of any previous modifications; and
258
+
259
+ c. indicate the Licensed Material is licensed under this
260
+ Public License, and include the text of, or the URI or
261
+ hyperlink to, this Public License.
262
+
263
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
264
+ reasonable manner based on the medium, means, and context in
265
+ which You Share the Licensed Material. For example, it may be
266
+ reasonable to satisfy the conditions by providing a URI or
267
+ hyperlink to a resource that includes the required
268
+ information.
269
+
270
+ 3. If requested by the Licensor, You must remove any of the
271
+ information required by Section 3(a)(1)(A) to the extent
272
+ reasonably practicable.
273
+
274
+ 4. If You Share Adapted Material You produce, the Adapter's
275
+ License You apply must not prevent recipients of the Adapted
276
+ Material from complying with this Public License.
277
+
278
+ Section 4 -- Sui Generis Database Rights.
279
+
280
+ Where the Licensed Rights include Sui Generis Database Rights that
281
+ apply to Your use of the Licensed Material:
282
+
283
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
284
+ to extract, reuse, reproduce, and Share all or a substantial
285
+ portion of the contents of the database for NonCommercial purposes
286
+ only;
287
+
288
+ b. if You include all or a substantial portion of the database
289
+ contents in a database in which You have Sui Generis Database
290
+ Rights, then the database in which You have Sui Generis Database
291
+ Rights (but not its individual contents) is Adapted Material; and
292
+
293
+ c. You must comply with the conditions in Section 3(a) if You Share
294
+ all or a substantial portion of the contents of the database.
295
+
296
+ For the avoidance of doubt, this Section 4 supplements and does not
297
+ replace Your obligations under this Public License where the Licensed
298
+ Rights include other Copyright and Similar Rights.
299
+
300
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
301
+
302
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
303
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
304
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
305
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
306
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
307
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
308
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
309
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
310
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
311
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
312
+
313
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
314
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
315
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
316
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
317
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
318
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
319
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
320
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
321
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
322
+
323
+ c. The disclaimer of warranties and limitation of liability provided
324
+ above shall be interpreted in a manner that, to the extent
325
+ possible, most closely approximates an absolute disclaimer and
326
+ waiver of all liability.
327
+
328
+ Section 6 -- Term and Termination.
329
+
330
+ a. This Public License applies for the term of the Copyright and
331
+ Similar Rights licensed here. However, if You fail to comply with
332
+ this Public License, then Your rights under this Public License
333
+ terminate automatically.
334
+
335
+ b. Where Your right to use the Licensed Material has terminated under
336
+ Section 6(a), it reinstates:
337
+
338
+ 1. automatically as of the date the violation is cured, provided
339
+ it is cured within 30 days of Your discovery of the
340
+ violation; or
341
+
342
+ 2. upon express reinstatement by the Licensor.
343
+
344
+ For the avoidance of doubt, this Section 6(b) does not affect any
345
+ right the Licensor may have to seek remedies for Your violations
346
+ of this Public License.
347
+
348
+ c. For the avoidance of doubt, the Licensor may also offer the
349
+ Licensed Material under separate terms or conditions or stop
350
+ distributing the Licensed Material at any time; however, doing so
351
+ will not terminate this Public License.
352
+
353
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
354
+ License.
355
+
356
+ Section 7 -- Other Terms and Conditions.
357
+
358
+ a. The Licensor shall not be bound by any additional or different
359
+ terms or conditions communicated by You unless expressly agreed.
360
+
361
+ b. Any arrangements, understandings, or agreements regarding the
362
+ Licensed Material not stated herein are separate from and
363
+ independent of the terms and conditions of this Public License.
364
+
365
+ Section 8 -- Interpretation.
366
+
367
+ a. For the avoidance of doubt, this Public License does not, and
368
+ shall not be interpreted to, reduce, limit, restrict, or impose
369
+ conditions on any use of the Licensed Material that could lawfully
370
+ be made without permission under this Public License.
371
+
372
+ b. To the extent possible, if any provision of this Public License is
373
+ deemed unenforceable, it shall be automatically reformed to the
374
+ minimum extent necessary to make it enforceable. If the provision
375
+ cannot be reformed, it shall be severed from this Public License
376
+ without affecting the enforceability of the remaining terms and
377
+ conditions.
378
+
379
+ c. No term or condition of this Public License will be waived and no
380
+ failure to comply consented to unless expressly agreed to by the
381
+ Licensor.
382
+
383
+ d. Nothing in this Public License constitutes or may be interpreted
384
+ as a limitation upon, or waiver of, any privileges and immunities
385
+ that apply to the Licensor or You, including from the legal
386
+ processes of any jurisdiction or authority.
387
+
388
+ =======================================================================
389
+
390
+ Creative Commons is not a party to its public
391
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
392
+ its public licenses to material it publishes and in those instances
393
+ will be considered the “Licensor.” The text of the Creative Commons
394
+ public licenses is dedicated to the public domain under the CC0 Public
395
+ Domain Dedication. Except for the limited purpose of indicating that
396
+ material is shared under a Creative Commons public license or as
397
+ otherwise permitted by the Creative Commons policies published at
398
+ creativecommons.org/policies, Creative Commons does not authorize the
399
+ use of the trademark "Creative Commons" or any other trademark or logo
400
+ of Creative Commons without its prior written consent including,
401
+ without limitation, in connection with any unauthorized modifications
402
+ to any of its public licenses or any other arrangements,
403
+ understandings, or agreements concerning use of licensed material. For
404
+ the avoidance of doubt, this paragraph does not form part of the
405
+ public licenses.
406
+
407
+ Creative Commons may be contacted at creativecommons.org.
408
+
409
+ # ARKitScenes License
410
+ https://github.com/apple/ARKitScenes/
411
+ https://github.com/apple/ARKitScenes/blob/main/LICENSE
412
+
413
+ Attribution-NonCommercial-ShareAlike 4.0 International
414
+
415
+ =======================================================================
416
+
417
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
418
+ does not provide legal services or legal advice. Distribution of
419
+ Creative Commons public licenses does not create a lawyer-client or
420
+ other relationship. Creative Commons makes its licenses and related
421
+ information available on an "as-is" basis. Creative Commons gives no
422
+ warranties regarding its licenses, any material licensed under their
423
+ terms and conditions, or any related information. Creative Commons
424
+ disclaims all liability for damages resulting from their use to the
425
+ fullest extent possible.
426
+
427
+ Using Creative Commons Public Licenses
428
+
429
+ Creative Commons public licenses provide a standard set of terms and
430
+ conditions that creators and other rights holders may use to share
431
+ original works of authorship and other material subject to copyright
432
+ and certain other rights specified in the public license below. The
433
+ following considerations are for informational purposes only, are not
434
+ exhaustive, and do not form part of our licenses.
435
+
436
+ Considerations for licensors: Our public licenses are
437
+ intended for use by those authorized to give the public
438
+ permission to use material in ways otherwise restricted by
439
+ copyright and certain other rights. Our licenses are
440
+ irrevocable. Licensors should read and understand the terms
441
+ and conditions of the license they choose before applying it.
442
+ Licensors should also secure all rights necessary before
443
+ applying our licenses so that the public can reuse the
444
+ material as expected. Licensors should clearly mark any
445
+ material not subject to the license. This includes other CC-
446
+ licensed material, or material used under an exception or
447
+ limitation to copyright. More considerations for licensors:
448
+ wiki.creativecommons.org/Considerations_for_licensors
449
+
450
+ Considerations for the public: By using one of our public
451
+ licenses, a licensor grants the public permission to use the
452
+ licensed material under specified terms and conditions. If
453
+ the licensor's permission is not necessary for any reason--for
454
+ example, because of any applicable exception or limitation to
455
+ copyright--then that use is not regulated by the license. Our
456
+ licenses grant only permissions under copyright and certain
457
+ other rights that a licensor has authority to grant. Use of
458
+ the licensed material may still be restricted for other
459
+ reasons, including because others have copyright or other
460
+ rights in the material. A licensor may make special requests,
461
+ such as asking that all changes be marked or described.
462
+ Although not required by our licenses, you are encouraged to
463
+ respect those requests where reasonable. More considerations
464
+ for the public:
465
+ wiki.creativecommons.org/Considerations_for_licensees
466
+
467
+ =======================================================================
468
+
469
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
470
+ Public License
471
+
472
+ By exercising the Licensed Rights (defined below), You accept and agree
473
+ to be bound by the terms and conditions of this Creative Commons
474
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
475
+ ("Public License"). To the extent this Public License may be
476
+ interpreted as a contract, You are granted the Licensed Rights in
477
+ consideration of Your acceptance of these terms and conditions, and the
478
+ Licensor grants You such rights in consideration of benefits the
479
+ Licensor receives from making the Licensed Material available under
480
+ these terms and conditions.
481
+
482
+
483
+ Section 1 -- Definitions.
484
+
485
+ a. Adapted Material means material subject to Copyright and Similar
486
+ Rights that is derived from or based upon the Licensed Material
487
+ and in which the Licensed Material is translated, altered,
488
+ arranged, transformed, or otherwise modified in a manner requiring
489
+ permission under the Copyright and Similar Rights held by the
490
+ Licensor. For purposes of this Public License, where the Licensed
491
+ Material is a musical work, performance, or sound recording,
492
+ Adapted Material is always produced where the Licensed Material is
493
+ synched in timed relation with a moving image.
494
+
495
+ b. Adapter's License means the license You apply to Your Copyright
496
+ and Similar Rights in Your contributions to Adapted Material in
497
+ accordance with the terms and conditions of this Public License.
498
+
499
+ c. BY-NC-SA Compatible License means a license listed at
500
+ creativecommons.org/compatiblelicenses, approved by Creative
501
+ Commons as essentially the equivalent of this Public License.
502
+
503
+ d. Copyright and Similar Rights means copyright and/or similar rights
504
+ closely related to copyright including, without limitation,
505
+ performance, broadcast, sound recording, and Sui Generis Database
506
+ Rights, without regard to how the rights are labeled or
507
+ categorized. For purposes of this Public License, the rights
508
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
509
+ Rights.
510
+
511
+ e. Effective Technological Measures means those measures that, in the
512
+ absence of proper authority, may not be circumvented under laws
513
+ fulfilling obligations under Article 11 of the WIPO Copyright
514
+ Treaty adopted on December 20, 1996, and/or similar international
515
+ agreements.
516
+
517
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
518
+ any other exception or limitation to Copyright and Similar Rights
519
+ that applies to Your use of the Licensed Material.
520
+
521
+ g. License Elements means the license attributes listed in the name
522
+ of a Creative Commons Public License. The License Elements of this
523
+ Public License are Attribution, NonCommercial, and ShareAlike.
524
+
525
+ h. Licensed Material means the artistic or literary work, database,
526
+ or other material to which the Licensor applied this Public
527
+ License.
528
+
529
+ i. Licensed Rights means the rights granted to You subject to the
530
+ terms and conditions of this Public License, which are limited to
531
+ all Copyright and Similar Rights that apply to Your use of the
532
+ Licensed Material and that the Licensor has authority to license.
533
+
534
+ j. Licensor means the individual(s) or entity(ies) granting rights
535
+ under this Public License.
536
+
537
+ k. NonCommercial means not primarily intended for or directed towards
538
+ commercial advantage or monetary compensation. For purposes of
539
+ this Public License, the exchange of the Licensed Material for
540
+ other material subject to Copyright and Similar Rights by digital
541
+ file-sharing or similar means is NonCommercial provided there is
542
+ no payment of monetary compensation in connection with the
543
+ exchange.
544
+
545
+ l. Share means to provide material to the public by any means or
546
+ process that requires permission under the Licensed Rights, such
547
+ as reproduction, public display, public performance, distribution,
548
+ dissemination, communication, or importation, and to make material
549
+ available to the public including in ways that members of the
550
+ public may access the material from a place and at a time
551
+ individually chosen by them.
552
+
553
+ m. Sui Generis Database Rights means rights other than copyright
554
+ resulting from Directive 96/9/EC of the European Parliament and of
555
+ the Council of 11 March 1996 on the legal protection of databases,
556
+ as amended and/or succeeded, as well as other essentially
557
+ equivalent rights anywhere in the world.
558
+
559
+ n. You means the individual or entity exercising the Licensed Rights
560
+ under this Public License. Your has a corresponding meaning.
561
+
562
+
563
+ Section 2 -- Scope.
564
+
565
+ a. License grant.
566
+
567
+ 1. Subject to the terms and conditions of this Public License,
568
+ the Licensor hereby grants You a worldwide, royalty-free,
569
+ non-sublicensable, non-exclusive, irrevocable license to
570
+ exercise the Licensed Rights in the Licensed Material to:
571
+
572
+ a. reproduce and Share the Licensed Material, in whole or
573
+ in part, for NonCommercial purposes only; and
574
+
575
+ b. produce, reproduce, and Share Adapted Material for
576
+ NonCommercial purposes only.
577
+
578
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
579
+ Exceptions and Limitations apply to Your use, this Public
580
+ License does not apply, and You do not need to comply with
581
+ its terms and conditions.
582
+
583
+ 3. Term. The term of this Public License is specified in Section
584
+ 6(a).
585
+
586
+ 4. Media and formats; technical modifications allowed. The
587
+ Licensor authorizes You to exercise the Licensed Rights in
588
+ all media and formats whether now known or hereafter created,
589
+ and to make technical modifications necessary to do so. The
590
+ Licensor waives and/or agrees not to assert any right or
591
+ authority to forbid You from making technical modifications
592
+ necessary to exercise the Licensed Rights, including
593
+ technical modifications necessary to circumvent Effective
594
+ Technological Measures. For purposes of this Public License,
595
+ simply making modifications authorized by this Section 2(a)
596
+ (4) never produces Adapted Material.
597
+
598
+ 5. Downstream recipients.
599
+
600
+ a. Offer from the Licensor -- Licensed Material. Every
601
+ recipient of the Licensed Material automatically
602
+ receives an offer from the Licensor to exercise the
603
+ Licensed Rights under the terms and conditions of this
604
+ Public License.
605
+
606
+ b. Additional offer from the Licensor -- Adapted Material.
607
+ Every recipient of Adapted Material from You
608
+ automatically receives an offer from the Licensor to
609
+ exercise the Licensed Rights in the Adapted Material
610
+ under the conditions of the Adapter's License You apply.
611
+
612
+ c. No downstream restrictions. You may not offer or impose
613
+ any additional or different terms or conditions on, or
614
+ apply any Effective Technological Measures to, the
615
+ Licensed Material if doing so restricts exercise of the
616
+ Licensed Rights by any recipient of the Licensed
617
+ Material.
618
+
619
+ 6. No endorsement. Nothing in this Public License constitutes or
620
+ may be construed as permission to assert or imply that You
621
+ are, or that Your use of the Licensed Material is, connected
622
+ with, or sponsored, endorsed, or granted official status by,
623
+ the Licensor or others designated to receive attribution as
624
+ provided in Section 3(a)(1)(A)(i).
625
+
626
+ b. Other rights.
627
+
628
+ 1. Moral rights, such as the right of integrity, are not
629
+ licensed under this Public License, nor are publicity,
630
+ privacy, and/or other similar personality rights; however, to
631
+ the extent possible, the Licensor waives and/or agrees not to
632
+ assert any such rights held by the Licensor to the limited
633
+ extent necessary to allow You to exercise the Licensed
634
+ Rights, but not otherwise.
635
+
636
+ 2. Patent and trademark rights are not licensed under this
637
+ Public License.
638
+
639
+ 3. To the extent possible, the Licensor waives any right to
640
+ collect royalties from You for the exercise of the Licensed
641
+ Rights, whether directly or through a collecting society
642
+ under any voluntary or waivable statutory or compulsory
643
+ licensing scheme. In all other cases the Licensor expressly
644
+ reserves any right to collect such royalties, including when
645
+ the Licensed Material is used other than for NonCommercial
646
+ purposes.
647
+
648
+
649
+ Section 3 -- License Conditions.
650
+
651
+ Your exercise of the Licensed Rights is expressly made subject to the
652
+ following conditions.
653
+
654
+ a. Attribution.
655
+
656
+ 1. If You Share the Licensed Material (including in modified
657
+ form), You must:
658
+
659
+ a. retain the following if it is supplied by the Licensor
660
+ with the Licensed Material:
661
+
662
+ i. identification of the creator(s) of the Licensed
663
+ Material and any others designated to receive
664
+ attribution, in any reasonable manner requested by
665
+ the Licensor (including by pseudonym if
666
+ designated);
667
+
668
+ ii. a copyright notice;
669
+
670
+ iii. a notice that refers to this Public License;
671
+
672
+ iv. a notice that refers to the disclaimer of
673
+ warranties;
674
+
675
+ v. a URI or hyperlink to the Licensed Material to the
676
+ extent reasonably practicable;
677
+
678
+ b. indicate if You modified the Licensed Material and
679
+ retain an indication of any previous modifications; and
680
+
681
+ c. indicate the Licensed Material is licensed under this
682
+ Public License, and include the text of, or the URI or
683
+ hyperlink to, this Public License.
684
+
685
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
686
+ reasonable manner based on the medium, means, and context in
687
+ which You Share the Licensed Material. For example, it may be
688
+ reasonable to satisfy the conditions by providing a URI or
689
+ hyperlink to a resource that includes the required
690
+ information.
691
+ 3. If requested by the Licensor, You must remove any of the
692
+ information required by Section 3(a)(1)(A) to the extent
693
+ reasonably practicable.
694
+
695
+ b. ShareAlike.
696
+
697
+ In addition to the conditions in Section 3(a), if You Share
698
+ Adapted Material You produce, the following conditions also apply.
699
+
700
+ 1. The Adapter's License You apply must be a Creative Commons
701
+ license with the same License Elements, this version or
702
+ later, or a BY-NC-SA Compatible License.
703
+
704
+ 2. You must include the text of, or the URI or hyperlink to, the
705
+ Adapter's License You apply. You may satisfy this condition
706
+ in any reasonable manner based on the medium, means, and
707
+ context in which You Share Adapted Material.
708
+
709
+ 3. You may not offer or impose any additional or different terms
710
+ or conditions on, or apply any Effective Technological
711
+ Measures to, Adapted Material that restrict exercise of the
712
+ rights granted under the Adapter's License You apply.
713
+
714
+
715
+ Section 4 -- Sui Generis Database Rights.
716
+
717
+ Where the Licensed Rights include Sui Generis Database Rights that
718
+ apply to Your use of the Licensed Material:
719
+
720
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
721
+ to extract, reuse, reproduce, and Share all or a substantial
722
+ portion of the contents of the database for NonCommercial purposes
723
+ only;
724
+
725
+ b. if You include all or a substantial portion of the database
726
+ contents in a database in which You have Sui Generis Database
727
+ Rights, then the database in which You have Sui Generis Database
728
+ Rights (but not its individual contents) is Adapted Material,
729
+ including for purposes of Section 3(b); and
730
+
731
+ c. You must comply with the conditions in Section 3(a) if You Share
732
+ all or a substantial portion of the contents of the database.
733
+
734
+ For the avoidance of doubt, this Section 4 supplements and does not
735
+ replace Your obligations under this Public License where the Licensed
736
+ Rights include other Copyright and Similar Rights.
737
+
738
+
739
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
740
+
741
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
742
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
743
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
744
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
745
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
746
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
747
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
748
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
749
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
750
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
751
+
752
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
753
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
754
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
755
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
756
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
757
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
758
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
759
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
760
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
761
+
762
+ c. The disclaimer of warranties and limitation of liability provided
763
+ above shall be interpreted in a manner that, to the extent
764
+ possible, most closely approximates an absolute disclaimer and
765
+ waiver of all liability.
766
+
767
+
768
+ Section 6 -- Term and Termination.
769
+
770
+ a. This Public License applies for the term of the Copyright and
771
+ Similar Rights licensed here. However, if You fail to comply with
772
+ this Public License, then Your rights under this Public License
773
+ terminate automatically.
774
+
775
+ b. Where Your right to use the Licensed Material has terminated under
776
+ Section 6(a), it reinstates:
777
+
778
+ 1. automatically as of the date the violation is cured, provided
779
+ it is cured within 30 days of Your discovery of the
780
+ violation; or
781
+
782
+ 2. upon express reinstatement by the Licensor.
783
+
784
+ For the avoidance of doubt, this Section 6(b) does not affect any
785
+ right the Licensor may have to seek remedies for Your violations
786
+ of this Public License.
787
+
788
+ c. For the avoidance of doubt, the Licensor may also offer the
789
+ Licensed Material under separate terms or conditions or stop
790
+ distributing the Licensed Material at any time; however, doing so
791
+ will not terminate this Public License.
792
+
793
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
794
+ License.
795
+
796
+
797
+ Section 7 -- Other Terms and Conditions.
798
+
799
+ a. The Licensor shall not be bound by any additional or different
800
+ terms or conditions communicated by You unless expressly agreed.
801
+
802
+ b. Any arrangements, understandings, or agreements regarding the
803
+ Licensed Material not stated herein are separate from and
804
+ independent of the terms and conditions of this Public License.
805
+
806
+
807
+ Section 8 -- Interpretation.
808
+
809
+ a. For the avoidance of doubt, this Public License does not, and
810
+ shall not be interpreted to, reduce, limit, restrict, or impose
811
+ conditions on any use of the Licensed Material that could lawfully
812
+ be made without permission under this Public License.
813
+
814
+ b. To the extent possible, if any provision of this Public License is
815
+ deemed unenforceable, it shall be automatically reformed to the
816
+ minimum extent necessary to make it enforceable. If the provision
817
+ cannot be reformed, it shall be severed from this Public License
818
+ without affecting the enforceability of the remaining terms and
819
+ conditions.
820
+
821
+ c. No term or condition of this Public License will be waived and no
822
+ failure to comply consented to unless expressly agreed to by the
823
+ Licensor.
824
+
825
+ d. Nothing in this Public License constitutes or may be interpreted
826
+ as a limitation upon, or waiver of, any privileges and immunities
827
+ that apply to the Licensor or You, including from the legal
828
+ processes of any jurisdiction or authority.
829
+
830
+ =======================================================================
831
+
832
+ Creative Commons is not a party to its public
833
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
834
+ its public licenses to material it publishes and in those instances
835
+ will be considered the “Licensor.” The text of the Creative Commons
836
+ public licenses is dedicated to the public domain under the CC0 Public
837
+ Domain Dedication. Except for the limited purpose of indicating that
838
+ material is shared under a Creative Commons public license or as
839
+ otherwise permitted by the Creative Commons policies published at
840
+ creativecommons.org/policies, Creative Commons does not authorize the
841
+ use of the trademark "Creative Commons" or any other trademark or logo
842
+ of Creative Commons without its prior written consent including,
843
+ without limitation, in connection with any unauthorized modifications
844
+ to any of its public licenses or any other arrangements,
845
+ understandings, or agreements concerning use of licensed material. For
846
+ the avoidance of doubt, this paragraph does not form part of the
847
+ public licenses.
848
+
849
+ Creative Commons may be contacted at creativecommons.org.
850
+
851
+
852
+
853
+ # Objectron License
854
+ https://github.com/google-research-datasets/Objectron
855
+ https://github.com/google-research-datasets/Objectron/blob/main/LICENSE
856
+
857
+
858
+ # Computational Use of Data Agreement v1.0
859
+
860
+ This is the Computational Use of Data Agreement, Version 1.0 (the “C-UDA”). Capitalized terms are defined in Section 5. Data Provider and you agree as follows:
861
+
862
+ 1. **Provision of the Data**
863
+
864
+ 1.1. You may use, modify, and distribute the Data made available to you by the Data Provider under this C-UDA for Computational Use if you follow the C-UDA's terms.
865
+
866
+ 1.2. Data Provider will not sue you or any Downstream Recipient for any claim arising out of the use, modification, or distribution of the Data provided you meet the terms of the C-UDA.
867
+
868
+ 1.3 This C-UDA does not restrict your use, modification, or distribution of any portions of the Data that are in the public domain or that may be used, modified, or distributed under any other legal exception or limitation.
869
+
870
+ 2. **Restrictions**
871
+
872
+ 2.1 You agree that you will use the Data solely for Computational Use.
873
+
874
+ 2.2 The C-UDA does not impose any restriction with respect to the use, modification, or distribution of Results.
875
+
876
+ 3. **Redistribution of Data**
877
+
878
+ 3.1. You may redistribute the Data, so long as:
879
+
880
+ 3.1.1. You include with any Data you redistribute all credit or attribution information that you received with the Data, and your terms require any Downstream Recipient to do the same; and
881
+
882
+ 3.1.2. You bind each recipient to whom you redistribute the Data to the terms of the C-UDA.
883
+
884
+ 4. **No Warranty, Limitation of Liability**
885
+
886
+ 4.1. Data Provider does not represent or warrant that it has any rights whatsoever in the Data.
887
+
888
+ 4.2. THE DATA IS PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
889
+
890
+ 4.3. NEITHER DATA PROVIDER NOR ANY UPSTREAM DATA PROVIDER SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE DATA OR RESULTS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
891
+
892
+ 5. **Definitions**
893
+
894
+ 5.1. “Computational Use” means activities necessary to enable the use of Data (alone or along with other material) for analysis by a computer.
895
+
896
+ 5.2. “Data” means the material you receive under the C-UDA in modified or unmodified form, but not including Results.
897
+
898
+ 5.3. “Data Provider” means the source from which you receive the Data and with whom you enter into the C-UDA.
899
+
900
+ 5.4. “Downstream Recipient” means any person or persons who receives the Data directly or indirectly from you in accordance with the C-UDA.
901
+
902
+ 5.5. “Result” means anything that you develop or improve from your use of Data that does not include more than a de minimis portion of the Data on which the use is based. Results may include de minimis portions of the Data necessary to report on or explain use that has been conducted with the Data, such as figures in scientific papers, but do not include more. Artificial intelligence models trained on Data (and which do not include more than a de minimis portion of Data) are Results.
903
+
904
+ 5.6. “Upstream Data Providers” means the source or sources from which the Data Provider directly or indirectly received, under the terms of the C-UDA, material that is included in the Data.
905
+
906
+
MODEL_ZOO.md ADDED
@@ -0,0 +1,17 @@
1
+ # Cube R-CNN Model Zoo on Omni3D
2
+
3
+ ## Models
4
+
5
+ We provide a model zoo for models trained on Omni3D data splits (see paper for more details).
6
+
7
+ | | Omni3D | Omni3D (Indoor only) | Omni3D (Outdoor only) |
8
+ |---------|:-------------------------:|:----------------------------:|:----------------------------:|
9
+ | `res34` | [omni3d/cubercnn_Res34_FPN.pth][res34_omni] | [indoor/cubercnn_Res34_FPN.pth][res34_in] | [outdoor/cubercnn_Res34_FPN.pth][res34_out] |
10
+ | `dla34` | [omni3d/cubercnn_DLA34_FPN.pth][dla34_omni] | [indoor/cubercnn_DLA34_FPN.pth][dla34_in] | [outdoor/cubercnn_DLA34_FPN.pth][dla34_out] |
11
+
12
+ [dla34_omni]: https://dl.fbaipublicfiles.com/cubercnn/omni3d/cubercnn_DLA34_FPN.pth
13
+ [dla34_in]: https://dl.fbaipublicfiles.com/cubercnn/indoor/cubercnn_DLA34_FPN.pth
14
+ [dla34_out]: https://dl.fbaipublicfiles.com/cubercnn/outdoor/cubercnn_DLA34_FPN.pth
15
+ [res34_omni]: https://dl.fbaipublicfiles.com/cubercnn/omni3d/cubercnn_Res34_FPN.pth
16
+ [res34_in]: https://dl.fbaipublicfiles.com/cubercnn/indoor/cubercnn_Res34_FPN.pth
17
+ [res34_out]: https://dl.fbaipublicfiles.com/cubercnn/outdoor/cubercnn_Res34_FPN.pth
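+ 
+ ## Sanity-checking a checkpoint (sketch)
+ 
+ The snippet below is a minimal, non-authoritative sketch for inspecting a downloaded checkpoint before wiring it into a config; it assumes the files above are standard PyTorch checkpoints and uses a hypothetical local path.
+ 
+ ```python
+ import torch
+ 
+ # Hypothetical path to a checkpoint downloaded from the table above.
+ ckpt = torch.load("checkpoints/cubercnn_DLA34_FPN.pth", map_location="cpu")
+ if isinstance(ckpt, dict):
+     print(list(ckpt.keys())[:5])  # peek at the top-level keys before loading into a model
+ ```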
ProposalNetwork/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .spaces import *
2
+ from .conversions import *
3
+ from .utils import *
ProposalNetwork/utils/conversions.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import numpy as np
3
+ from detectron2.structures import Boxes
4
+
5
+ def cube_to_box(cube,K):
6
+ '''
7
+ Converts a Cube to a Box.
8
+
9
+ Args:
10
+ cube: A Cube.
11
+ K: The 3x3 camera intrinsic matrix.
12
+
13
+ Returns:
14
+ A Box.
15
+ '''
16
+ bube_corners = cube.get_bube_corners(K)
17
+
18
+ min_x = torch.min(bube_corners[:,0])
19
+ max_x = torch.max(bube_corners[:,0])
20
+ min_y = torch.min(bube_corners[:,1])
21
+ max_y = torch.max(bube_corners[:,1])
22
+
23
+ return Boxes(torch.tensor([[min_x, min_y, max_x, max_y]], device=cube.tensor.device))
24
+
25
+ def cubes_to_box(cubes, K, im_shape):
26
+ '''
27
+ Converts Cubes to a list of per-instance Boxes.
28
+
29
+ Args:
30
+ cubes: A Cubes.
31
+ K: The 3x3 camera intrinsic matrix, shared by all cubes.
32
+ im_shape: The shape of the image (width, height).
33
+
34
+ Returns:
35
+ A list of Boxes, one per instance.
36
+ '''
37
+ bube_corners = cubes.get_bube_corners(K, im_shape)
38
+ min_x, _ = torch.min(bube_corners[:, :, :, 0], 2)
39
+ max_x, _ = torch.max(bube_corners[:, :, :, 0], 2)
40
+ min_y, _ = torch.min(bube_corners[:, :, :, 1], 2)
41
+ max_y, _ = torch.max(bube_corners[:, :, :, 1], 2)
42
+
43
+ values = torch.stack((min_x, min_y, max_x, max_y),dim=2)
44
+ box_list = []
45
+ for i in range(cubes.num_instances):
46
+ box_list.append(Boxes(values[i]))
47
+
48
+ return box_list
49
+
50
+
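+ # Usage sketch (hypothetical objects): project 3D cube proposals to enclosing 2D boxes.
+ # Assumes `cubes` is a Cubes instance from ProposalNetwork.utils.spaces and `K` a 3x3
+ # intrinsics tensor:
+ #   boxes_per_instance = cubes_to_box(cubes, K, im_shape=(640, 480))
+ #   # -> list with one detectron2 Boxes per instance, rows are [x1, y1, x2, y2]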
ProposalNetwork/utils/plane.py ADDED
@@ -0,0 +1,209 @@
1
+ import random
2
+ import torch
3
+ import numpy as np
4
+
5
+ class Plane_torch:
6
+ """
7
+ Implementation of planar RANSAC.
8
+
9
+ Class for Plane object, which finds the equation of an infinite plane using the RANSAC algorithm.
10
+
11
+ Call `fit(.)` to randomly take 3 points of pointcloud to verify inliers based on a threshold.
12
+
13
+ ![Plane](https://raw.githubusercontent.com/leomariga/pyRANSAC-3D/master/doc/plano.gif "Plane")
14
+
15
+ ---
16
+ """
17
+
18
+ def __init__(self):
19
+ self.inliers = []
20
+ self.equation = []
21
+
22
+ def fit(self, pts, thresh=0.05, minPoints=100, maxIteration=1000):
23
+ """
24
+ Find the best equation for a plane.
25
+
26
+ :param pts: 3D point cloud as a `torch.Tensor (N,3)`.
27
+ :param thresh: Threshold distance from the plane which is considered inlier.
28
+ :param maxIteration: Number of maximum iteration which RANSAC will loop over.
29
+ :returns:
30
+ - `self.equation`: Parameters of the plane using Ax+By+Cy+D `torch.Tensor(4)`
31
+ - `self.inliers`: points from the dataset considered inliers
32
+
33
+ ---
34
+ """
35
+ n_points = pts.shape[0]
36
+ best_eq = []
37
+ best_inliers = []
38
+
39
+ for it in range(maxIteration):
40
+
41
+ # Samples 3 random points
42
+ id_samples = torch.randperm(n_points)[:3]
43
+ pt_samples = pts[id_samples]
44
+
45
+ # We have to find the plane equation described by those 3 points
46
+ # We find first 2 vectors that are part of this plane
47
+ # A = pt2 - pt1
48
+ # B = pt3 - pt1
49
+
50
+ vecA = pt_samples[1, :] - pt_samples[0, :]
51
+ vecB = pt_samples[2, :] - pt_samples[0, :]
52
+
53
+ # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane
54
+ vecC = torch.cross(vecA, vecB)
55
+
56
+ # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[2]*z = -k
57
+ # We have to use a point to find k
58
+ vecC = vecC / torch.norm(vecC, p=2)
59
+ k = -torch.sum(torch.mul(vecC, pt_samples[1, :]))
60
+ plane_eq = torch.tensor([vecC[0], vecC[1], vecC[2], k])
61
+
62
+ # Distance from a point to a plane
63
+ # https://mathworld.wolfram.com/Point-PlaneDistance.html
64
+ pt_id_inliers = [] # list of inliers ids
65
+ dist_pt = (
66
+ plane_eq[0] * pts[:, 0] + plane_eq[1] * pts[:, 1] + plane_eq[2] * pts[:, 2] + plane_eq[3]
67
+ ) / torch.sqrt(plane_eq[0] ** 2 + plane_eq[1] ** 2 + plane_eq[2] ** 2)
68
+
69
+ # Select indexes where distance is smaller than the threshold
70
+ pt_id_inliers = torch.where(torch.abs(dist_pt) <= thresh)[0]
71
+ if len(pt_id_inliers) > len(best_inliers):
72
+ best_eq = plane_eq
73
+ best_inliers = pt_id_inliers
74
+ self.inliers = best_inliers
75
+ self.equation = best_eq
76
+
77
+ return -self.equation, self.inliers
78
+
79
+ def fit_parallel(self, pts:torch.Tensor, thresh=0.05, minPoints=100, maxIteration=1000):
80
+ """
81
+ Find the best equation for a plane.
82
+
83
+ :param pts: 3D point cloud as a `torch.Tensor (N,3)`.
84
+ :param thresh: Threshold distance from the plane which is considered inlier.
85
+ :param maxIteration: Number of maximum iteration which RANSAC will loop over.
86
+ :returns:
87
+ - `self.equation`: Parameters of the plane using Ax+By+Cy+D `torch.Tensor(4)`
88
+ - `self.inliers`: points from the dataset considered inliers
89
+
90
+ ---
91
+ """
92
+ n_points = pts.shape[0]
93
+
94
+ # Samples shape (maxIteration, 3) random points
95
+ id_samples = torch.tensor([random.sample(range(0, n_points), 3) for _ in range(maxIteration)],device=pts.device)
96
+ pt_samples = pts[id_samples]
97
+
98
+ # We have to find the plane equation described by those 3 points
99
+ # We find first 2 vectors that are part of this plane
100
+ # A = pt2 - pt1
101
+ # B = pt3 - pt1
102
+
103
+ vecA = pt_samples[:, 1, :] - pt_samples[:, 0, :]
104
+ vecB = pt_samples[:, 2, :] - pt_samples[:, 0, :]
105
+
106
+ # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane
107
+ vecC = torch.cross(vecA, vecB, dim=-1)
108
+
109
+ # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[2]*z = -k
110
+ # We have to use a point to find k
111
+ vecC = vecC / torch.norm(vecC, p=2, dim=1, keepdim=True)
112
+ k = -torch.sum(torch.mul(vecC, pt_samples[:, 1, :]), dim=1)
113
+ plane_eqs = torch.column_stack([vecC[:, 0], vecC[:, 1], vecC[:, 2], k])
114
+
115
+ # Distance from a point to a plane
116
+ # https://mathworld.wolfram.com/Point-PlaneDistance.html
117
+ dist_pt = (
118
+ plane_eqs[:,0].unsqueeze(1) * pts[:, 0] + plane_eqs[:,1].unsqueeze(1) * pts[:, 1] + plane_eqs[:,2].unsqueeze(1) * pts[:, 2] + plane_eqs[:,3].unsqueeze(1)
119
+ ) / torch.sqrt(plane_eqs[:,0] ** 2 + plane_eqs[:,1] ** 2 + plane_eqs[:,2] ** 2).unsqueeze(1)
120
+
121
+ # Select indexes where distance is smaller than the threshold
122
+ # maxIteration x n_points
123
+ # row with most inliers
124
+
125
+ pt_id_inliers = torch.abs(dist_pt) <= thresh
126
+ counts = torch.sum(pt_id_inliers, dim=1)
127
+
128
+ best_eq = plane_eqs[torch.argmax(counts)]
129
+ best_inliers_id = pt_id_inliers[torch.argmax(counts)]
130
+ # convert boolean tensor to indices
131
+ best_inliers = torch.where(best_inliers_id)[0]
132
+ self.inliers = best_inliers
133
+ self.equation = best_eq
134
+ return -self.equation, self.inliers
135
+
136
+
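+ # Usage sketch (synthetic points): fit a ground plane to an N x 3 point cloud in one call.
+ #   pts = torch.randn(5000, 3)                               # hypothetical depth-derived points
+ #   eq, inlier_idx = Plane_torch().fit_parallel(pts, thresh=0.05, maxIteration=1000)
+ #   # eq holds [A, B, C, D] of the plane Ax + By + Cz + D = 0; inlier_idx indexes rows of pts.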
137
+ class Plane_np:
138
+ """
139
+ Implementation of planar RANSAC.
140
+
141
+ Class for Plane object, which finds the equation of an infinite plane using the RANSAC algorithm.
142
+
143
+ Call `fit(.)` to randomly take 3 points of pointcloud to verify inliers based on a threshold.
144
+
145
+ ![Plane](https://raw.githubusercontent.com/leomariga/pyRANSAC-3D/master/doc/plano.gif "Plane")
146
+
147
+ ---
148
+ """
149
+
150
+ def __init__(self):
151
+ self.inliers = []
152
+ self.equation = []
153
+
154
+ def fit(self, pts, thresh=0.05, minPoints=100, maxIteration=1000):
155
+ """
156
+ Find the best equation for a plane.
157
+
158
+ :param pts: 3D point cloud as a `np.array (N,3)`.
159
+ :param thresh: Threshold distance from the plane which is considered inlier.
160
+ :param maxIteration: Number of maximum iteration which RANSAC will loop over.
161
+ :returns:
162
+ - `self.equation`: Parameters of the plane using Ax+By+Cy+D `np.array (1, 4)`
163
+ - `self.inliers`: points from the dataset considered inliers
164
+
165
+ ---
166
+ """
167
+ n_points = pts.shape[0]
168
+ best_eq = []
169
+ best_inliers = []
170
+
171
+ for it in range(maxIteration):
172
+
173
+ # Samples 3 random points
174
+ id_samples = random.sample(range(0, n_points), 3)
175
+ pt_samples = pts[id_samples]
176
+
177
+ # We have to find the plane equation described by those 3 points
178
+ # We find first 2 vectors that are part of this plane
179
+ # A = pt2 - pt1
180
+ # B = pt3 - pt1
181
+
182
+ vecA = pt_samples[1, :] - pt_samples[0, :]
183
+ vecB = pt_samples[2, :] - pt_samples[0, :]
184
+
185
+ # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane
186
+ vecC = np.cross(vecA, vecB)
187
+
188
+ # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[2]*z = -k
189
+ # We have to use a point to find k
190
+ vecC = vecC / np.linalg.norm(vecC)
191
+ k = -np.sum(np.multiply(vecC, pt_samples[1, :]))
192
+ plane_eq = [vecC[0], vecC[1], vecC[2], k]
193
+
194
+ # Distance from a point to a plane
195
+ # https://mathworld.wolfram.com/Point-PlaneDistance.html
196
+ pt_id_inliers = [] # list of inliers ids
197
+ dist_pt = (
198
+ plane_eq[0] * pts[:, 0] + plane_eq[1] * pts[:, 1] + plane_eq[2] * pts[:, 2] + plane_eq[3]
199
+ ) / np.sqrt(plane_eq[0] ** 2 + plane_eq[1] ** 2 + plane_eq[2] ** 2)
200
+
201
+ # Select indexes where distance is smaller than the threshold
202
+ pt_id_inliers = np.where(np.abs(dist_pt) <= thresh)[0]
203
+ if len(pt_id_inliers) > len(best_inliers):
204
+ best_eq = plane_eq
205
+ best_inliers = pt_id_inliers
206
+ self.inliers = best_inliers
207
+ self.equation = best_eq
208
+
209
+ return self.equation, self.inliers
ProposalNetwork/utils/spaces.py ADDED
@@ -0,0 +1,328 @@
1
+ import numpy as np
2
+ import torch
3
+ from cubercnn import util
4
+
5
+ '''
6
+ coordinate system is assumed to have origin in the upper left
7
+ (0,0) _________________(N,0)
8
+ |
9
+ |
10
+ |
11
+ |
12
+ |
13
+ (0,M)
14
+ '''
15
+ """
16
+ class Cube:
17
+ '''
18
+ 3D box in the format [c1, c2, c3, w, h, l, R]
19
+
20
+ Args:
21
+ c1: The x coordinate of the center of the box.
22
+ c2: The y coordinate of the center of the box.
23
+ c3: The z coordinate of the center of the box.
24
+ w: The width of the box in meters.
25
+ h: The height of the box in meters.
26
+ l: The length of the box in meters.
27
+ R: The 3D rotation matrix of the box.
28
+ ```
29
+
30
+ _____________________
31
+ /| /|
32
+ / | / |
33
+ / | / |
34
+ /___|_________________/ |
35
+ | | | | h
36
+ | | | |
37
+ | | | |
38
+ | | (c1,c2,c3) | |
39
+ | |_________________|___|
40
+ | / | /
41
+ | / | /
42
+ | / | / l
43
+ |/_____________________|/
44
+ w
45
+ ```
46
+ '''
47
+ def __init__(self,tensor: torch.Tensor, R: torch.Tensor, score=None, label=None) -> None:
48
+ self.tensor = tensor
49
+ self.center = tensor[:3]
50
+ self.dimensions = tensor[3:6]
51
+ self.rotation = R
52
+
53
+ # score and label are meant as auxiliary information
54
+ self.score = score
55
+ self.label = label
56
+
57
+ def get_cube(self):
58
+ color = [c/255.0 for c in util.get_color()]
59
+ return util.mesh_cuboid(torch.cat((self.center,self.dimensions)), self.rotation, color=color)
60
+
61
+ def get_all_corners(self):
62
+ '''wrap ``util.get_cuboid_verts_faces``
63
+
64
+ Returns:
65
+ verts: the 3D vertices of the cuboid in camera space'''
66
+ verts, _ = util.get_cuboid_verts_faces(torch.cat((self.center,self.dimensions)), self.rotation)
67
+ return verts
68
+
69
+ def get_bube_corners(self,K) -> torch.Tensor:
70
+ cube_corners = self.get_all_corners()
71
+ cube_corners = torch.mm(K, cube_corners.t()).t()
72
+ return cube_corners[:,:2]/cube_corners[:,2].unsqueeze(1)
73
+
74
+ def get_volume(self) -> float:
75
+ return self.dimensions.prod().item()
76
+
77
+
78
+ def __repr__(self) -> str:
79
+ return f'Cube({self.center}, {self.dimensions}, {self.rotation})'
80
+
81
+ def to_device(self, device):
82
+ '''
83
+ Move all tensors of the instantiated class to the specified device.
84
+
85
+ Args:
86
+ device: The device to move the tensors to (e.g., 'cuda', 'cpu').
87
+ '''
88
+ self.tensor = self.tensor.to(device)
89
+ self.center = self.center.to(device)
90
+ self.dimensions = self.dimensions.to(device)
91
+ self.rotation = self.rotation.to(device)
92
+ return self
93
+ """
94
+
95
+ class Cubes:
96
+ '''
97
+ 3D boxes in the format [[c1, c2, c3, w, h, l, R1...R9]]
98
+
99
+ inspired by `detectron2.structures.Boxes`
100
+
101
+ Args:
102
+ tensor: torch.tensor(
103
+ c1: The x coordinates of the center of the boxes.
104
+ c2: The y coordinates of the center of the boxes.
105
+ c3: The z coordinates of the center of the boxes.
106
+ w: The width of the boxes in meters.
107
+ h: The height of the boxes in meters.
108
+ l: The length of the boxes in meters.
109
+ R: The flattened 3D rotation matrix of the boxes (i.e. the rows are next to each other).
110
+ )
111
+ of shape (N, 15).
112
+ ```
113
+ _____________________
114
+ /| /|
115
+ / | / |
116
+ / | / |
117
+ /___|_________________/ |
118
+ | | | | h
119
+ | | | |
120
+ | | | |
121
+ | | (c1,c2,c3) | |
122
+ | |_________________|___|
123
+ | / | /
124
+ | / | /
125
+ | / | / l
126
+ |/_____________________|/
127
+ w
128
+ ```
129
+ '''
130
+ def __init__(self,tensor: torch.Tensor, scores=None, labels=None) -> None:
131
+
132
+ # score and label are meant as auxiliary information
133
+ if scores is not None:
134
+ assert scores.ndim == 2, f"scores.shape must be (n_instances, n_proposals), but was {scores.shape}"
135
+ self.scores = scores
136
+ self.labels = labels
137
+
138
+ if not isinstance(tensor, torch.Tensor):
139
+ if not isinstance(tensor, np.ndarray):
140
+ tensor = np.asarray(tensor)
141
+ tensor = torch.as_tensor(tensor, dtype=torch.float32, device=torch.device("cpu"))
142
+ else:
143
+ tensor = tensor.to(torch.float32)
144
+ if tensor.numel() == 0:
145
+ tensor = tensor.reshape((-1, 15)).to(dtype=torch.float32)
146
+ self.tensor = tensor
147
+ if self.tensor.dim() == 1:
148
+ self.tensor = self.tensor.unsqueeze(0)
149
+ if self.tensor.dim() == 2:
150
+ self.tensor = self.tensor.unsqueeze(0)
151
+
152
+ @property
153
+ def centers(self):
154
+ return self.tensor[:, :, :3]
155
+
156
+ @property
157
+ def dimensions(self):
158
+ return self.tensor[:, :, 3:6]
159
+
160
+ @property
161
+ def rotations(self):
162
+ shape = self.tensor.shape
163
+ return self.tensor[:, :, 6:].reshape(shape[0],shape[1], 3, 3)
164
+
165
+ @property
166
+ def device(self):
167
+ return self.tensor.device
168
+
169
+ @property
170
+ def num_instances(self):
171
+ return self.tensor.shape[0]
172
+
173
+ @property
174
+ def shape(self):
175
+ return self.tensor.shape
176
+
177
+ def clone(self) -> "Cubes":
178
+ """
179
+ Clone the Cubes.
180
+
181
+ Returns:
182
+ Cubes
183
+ """
184
+ return Cubes(self.tensor.clone())
185
+
186
+
187
+ def get_cubes(self):
188
+ color = [c/255.0 for c in util.get_color()]
189
+ return util.mesh_cuboid(torch.cat((self.centers.squeeze(0),self.dimensions.squeeze(0)),dim=1), self.rotations.squeeze(0), color=color)
190
+
191
+
192
+ def get_all_corners(self):
193
+ '''wrap ``util.get_cuboid_verts_faces``
194
+
195
+ Returns:
196
+ verts: the 3D vertices of the cuboid in camera space'''
197
+
198
+ verts_list = []
199
+ for i in range(self.num_instances):
200
+ verts_next_instance, _ = util.get_cuboid_verts_faces(self.tensor[i, :, :6], self.rotations[i])
201
+ verts_list.append(verts_next_instance)
202
+ verts = torch.stack(verts_list, dim=0)
203
+
204
+ return verts
205
+
206
+ def get_cuboids_verts_faces(self):
207
+ '''wrap ``util.get_cuboid_verts_faces``
208
+
209
+ Returns:
210
+ verts: the 3D vertices of the cuboid in camera space
211
+ faces: the faces of the cuboid in camera space'''
212
+
213
+ verts_list = []
214
+ faces_list = []
215
+ for i in range(self.num_instances):
216
+ verts_next_instance, faces = util.get_cuboid_verts_faces(self.tensor[i, :, :6], self.rotations[i])
217
+ verts_list.append(verts_next_instance)
218
+ faces_list.append(faces)
219
+ verts = torch.stack(verts_list, dim=0)
220
+ faces = torch.stack(faces_list, dim=0)
221
+
222
+ return verts, faces
223
+
224
+ def get_bube_corners(self, K, clamp:tuple=None) -> torch.Tensor:
225
+ '''This assumes that all the cubes have the same camera intrinsic matrix K
226
+
227
+ clamp is a typically the image shape (width, height) to truncate the boxes to image frame, this avoids huge projected boxes
228
+ Returns:
229
+ num_instances x N x 8 x 2'''
230
+ cube_corners = self.get_all_corners() # num_instances x N x 8 x 3
231
+ num_prop = cube_corners.shape[1]
232
+ cube_corners = cube_corners.reshape(self.num_instances * num_prop, 8, 3)
233
+ K_repeated = K.repeat(self.num_instances * num_prop,1,1)
234
+ cube_corners = torch.matmul(K_repeated, cube_corners.transpose(2,1))
235
+ cube_corners = cube_corners[:, :2, :]/cube_corners[:, 2, :].unsqueeze(-2)
236
+ cube_corners = cube_corners.transpose(2,1)
237
+ cube_corners = cube_corners.reshape(self.num_instances, num_prop, 8, 2)
238
+
239
+ # we must clamp and then stack, otherwise the gradient is broken
240
+ if clamp is not None:
241
+ x = torch.clamp(cube_corners[..., 0], int(-clamp[0]/2+1), int(clamp[0]-1+clamp[0]))
242
+ y = torch.clamp(cube_corners[..., 1], int(-clamp[1]/2+1), int(clamp[1]-1+clamp[1]))
243
+ cube_corners = torch.stack((x, y), dim=-1)
244
+
245
+ return cube_corners # num_instances x num_proposals x 8 x 2
246
+
247
+ def get_volumes(self) -> float:
248
+ return self.get_dimensions().prod(1).item()
249
+
250
+ def __len__(self) -> int:
251
+ return self.tensor.shape[0]
252
+
253
+ def __repr__(self) -> str:
254
+ return f'Cubes({self.tensor})'
255
+
256
+ def to(self, device: torch.device):
257
+ # Cubes is assumed float32 and does not support to(dtype)
258
+ if isinstance(self.scores, torch.Tensor):
259
+ self.scores = self.scores.to(device=device)
260
+ if isinstance(self.labels, torch.Tensor):
261
+ self.labels = self.labels.to(device=device)
262
+ return Cubes(self.tensor.to(device=device), self.scores, self.labels)
263
+
264
+ def __getitem__(self, item) -> "Cubes":
265
+ """
266
+ Args:
267
+ item: int, slice, or a BoolTensor
268
+
269
+ Returns:
270
+ Cubes: Create a new :class:`Cubes` by indexing.
271
+
272
+ The following usage are allowed:
273
+
274
+ 1. `new_cubes = cubes[3]`: return a `Cubes` which contains only one box.
275
+ 2. `new_cubes = cubes[2:10]`: return a slice of cubes.
276
+ 3. `new_cubes = cubes[vector]`, where vector is a torch.BoolTensor
277
+ with `length = len(cubes)`. Nonzero elements in the vector will be selected.
278
+
279
+ Note that the returned Cubes might share storage with this Cubes,
280
+ subject to Pytorch's indexing semantics.
281
+ """
282
+ if isinstance(item, int):
283
+ prev_n_prop = self.tensor.shape[1]
284
+ return Cubes(self.tensor[item].view(1, prev_n_prop, -1))
285
+ elif isinstance(item, tuple):
286
+ return Cubes(self.tensor[item[0],item[1]].view(1, 1, -1))
287
+ b = self.tensor[item]
288
+ assert b.dim() == 2, "Indexing on Cubes with {} failed to return a matrix!".format(item)
289
+ return Cubes(b)
290
+
291
+
292
+ @classmethod
293
+ def cat(cls, cubes_list: list["Cubes"]) -> "Cubes":
294
+ """
295
+ Concatenates a list of Cubes into a single Cubes
296
+
297
+ Arguments:
298
+ cubes_list (list[Cubes])
299
+
300
+ Returns:
301
+ Cubes: the concatenated Cubes
302
+ """
303
+ assert isinstance(cubes_list, (list, tuple))
304
+ if len(cubes_list) == 0:
305
+ return cls(torch.empty(0))
306
+ assert all([isinstance(box, Cubes) for box in cubes_list])
307
+
308
+ # use torch.cat (v.s. layers.cat) so the returned cubes never share storage with input
309
+ cat_cubes = cls(torch.cat([b.tensor for b in cubes_list], dim=0))
310
+ return cat_cubes
311
+
312
+ @torch.jit.unused
313
+ def __iter__(self):
314
+ """
315
+ Yield a cube as a Tensor of shape (15,) at a time.
316
+ """
317
+ yield from self.tensor
318
+
319
+ def split(self, split_size: int, dim=1) -> tuple["Cubes"]:
320
+ """same behaviour as torch.split, return a tuple of chunksize Cubes"""
321
+ return tuple(Cubes(x) for x in self.tensor.split(split_size, dim=dim))
322
+
323
+ def reshape(self, *args) -> "Cubes":
324
+ """
325
+ Returns:
326
+ Cubes: reshaped Cubes
327
+ """
328
+ return Cubes(self.tensor.reshape(*args), self.scores, self.labels)
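+ 
+ # Usage sketch (hypothetical values): one instance holding a single axis-aligned proposal.
+ #   R = torch.eye(3).flatten()
+ #   cube = Cubes(torch.cat([torch.tensor([0., 0., 5., 1., 1., 1.]), R]))  # tensor shape (1, 1, 15)
+ #   corners_2d = cube.get_bube_corners(K)  # K: 3x3 intrinsics tensor -> (1, 1, 8, 2) projected corners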
ProposalNetwork/utils/utils.py ADDED
@@ -0,0 +1,564 @@
1
+ import torch
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+
5
+ from detectron2.structures import pairwise_iou
6
+ from pytorch3d.ops import box3d_overlap
7
+
8
+ ##### Proposal
9
+ def normalize_vector(v):
10
+ v_mag = torch.sqrt(v.pow(2).sum())
11
+ v_mag = torch.max(v_mag, torch.tensor([1e-8], device=v.device))
12
+ v_mag = v_mag.view(1,1).expand(1,v.shape[0])
13
+ v = v/v_mag
14
+
15
+ return v[0]
16
+
17
+ def cross_product(u, v):
18
+ i = u[1]*v[2] - u[2]*v[1]
19
+ j = u[2]*v[0] - u[0]*v[2]
20
+ k = u[0]*v[1] - u[1]*v[0]
21
+ out = torch.cat((i.view(1,1), j.view(1,1), k.view(1,1)),1)
22
+
23
+ return out[0]
24
+
25
+ def compute_rotation_matrix_from_ortho6d(poses):
26
+ x_raw = poses[0:3]
27
+ y_raw = poses[3:6]
28
+
29
+ x = normalize_vector(x_raw)
30
+ z = cross_product(x,y_raw)
31
+ z = normalize_vector(z)
32
+ y = cross_product(z,x)
33
+
34
+ x = x.view(-1,3,1)
35
+ y = y.view(-1,3,1)
36
+ z = z.view(-1,3,1)
37
+ matrix = torch.cat((x,y,z), 2)[0]
38
+
39
+ return matrix
40
+
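+ # Usage sketch: map an unconstrained 6D vector (e.g. a network output) to a rotation matrix.
+ #   pose6d = torch.randn(6)                            # hypothetical 6D pose parameterisation
+ #   R = compute_rotation_matrix_from_ortho6d(pose6d)   # 3x3 matrix with orthonormal columns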
41
+ def sample_normal_in_range(means, stds, count, threshold_low=None, threshold_high=None):
42
+ device = means.device
43
+ # Generate samples from a normal distribution
44
+ samples = torch.normal(means.unsqueeze(1).expand(-1,count), stds.unsqueeze(1).expand(-1,count))
45
+
46
+ # Ensure that all samples are greater than threshold_low and less than threshold_high
47
+ if threshold_high is not None and threshold_low is not None:
48
+ tries = 0
49
+ threshold_high = threshold_high.unsqueeze(1).expand_as(samples)
50
+ while torch.any((samples < threshold_low) | (samples > threshold_high)):
51
+ invalid_mask = (samples < threshold_low) | (samples > threshold_high)
52
+ # Replace invalid samples with new samples drawn from the normal distribution; could be done more efficiently by sampling only sum(invalid_mask) new samples, but matching the correct means is difficult then
53
+ samples[invalid_mask] = torch.normal(means.unsqueeze(1).expand(-1,count), stds.unsqueeze(1).expand(-1,count))[invalid_mask]
54
+
55
+ tries += 1
56
+ if tries == 10000:
57
+ break
58
+
59
+ return samples.to(device)
60
+
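+ # Usage sketch: draw 100 truncated-normal samples per object, e.g. depth proposals.
+ #   means, stds = torch.tensor([5.0]), torch.tensor([1.0])          # hypothetical priors
+ #   z = sample_normal_in_range(means, stds, 100, threshold_low=0.5,
+ #                              threshold_high=torch.tensor([20.0]))  # shape (1, 100)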
61
+ def randn_orthobasis_torch(num_samples=1,num_instances=1):
62
+ z = torch.randn(num_instances, num_samples, 3, 3)
63
+ z = z / torch.norm(z, p=2, dim=-1, keepdim=True)
64
+ z[:, :, 0] = torch.cross(z[:, :, 1], z[:, :, 2], dim=-1)
65
+ z[:, :, 0] = z[:, :, 0] / torch.norm(z[:, :, 0], dim=-1, keepdim=True)
66
+ z[:, :, 1] = torch.cross(z[:, :, 2], z[:, :, 0], dim=-1)
67
+ z[:, :, 1] = z[:, :, 1] / torch.norm(z[:, :, 1], dim=-1, keepdim=True)
68
+ return z
69
+
70
+ def randn_orthobasis(num_samples=1):
71
+ z = np.random.randn(num_samples, 3, 3)
72
+ z = z / np.linalg.norm(z, axis=-1, keepdims=True)
73
+ z[:, 0] = np.cross(z[:, 1], z[:, 2], axis=-1)
74
+ z[:, 0] = z[:, 0] / np.linalg.norm(z[:, 0], axis=-1, keepdims=True)
75
+ z[:, 1] = np.cross(z[:, 2], z[:, 0], axis=-1)
76
+ z[:, 1] = z[:, 1] / np.linalg.norm(z[:, 1], axis=-1, keepdims=True)
77
+ return z
78
+
79
+ # ##things for making rotations
80
+ def vec_perp(vec):
81
+ '''generate a vector perpendicular to vec in 3d'''
82
+ # https://math.stackexchange.com/a/2450825
83
+ a, b, c = vec
84
+ if a == 0:
85
+ return np.array([0,c,-b])
86
+ return np.array(normalize_vector(torch.tensor([b,-a,0])))
87
+
88
+ def orthobasis_from_normal(normal, yaw_angle=0):
89
+ '''generate an orthonormal/Rotation matrix basis from a normal vector in 3d
90
+
91
+ returns a 3x3 matrix with the basis vectors as columns, 3rd column is the original normal vector
92
+ '''
93
+ x = rotate_vector(vec_perp(normal), normal, yaw_angle)
94
+ x = x / np.linalg.norm(x, ord=2)
95
+ y = np.cross(normal, x)
96
+ return np.array([x, normal, y]).T # the vectors should be as columns
97
+
98
+ def rotate_vector(v, k, theta):
99
+ '''rotate a vector v around an axis k by an angle theta
100
+ it is assumed that k is a unit vector (p2 norm = 1)'''
101
+ # https://medium.com/@sim30217/rodrigues-rotation-formula-47489db49050
102
+ cos_theta = np.cos(theta)
103
+ sin_theta = np.sin(theta)
104
+
105
+ term1 = v * cos_theta
106
+ term2 = np.cross(k, v) * sin_theta
107
+ term3 = k * np.dot(k, v) * (1 - cos_theta)
108
+
109
+ return term1 + term2 + term3
110
+
111
+ def vec_perp_t(vec):
112
+ '''generate a vector perpendicular to vec in 3d'''
113
+ # https://math.stackexchange.com/a/2450825
114
+ a, b, c = vec
115
+ if a == 0:
116
+ return torch.tensor([0,c,-b], device=vec.device)
117
+ return normalize_vector(torch.tensor([b,-a,0], device=vec.device))
118
+
119
+ def orthobasis_from_normal_t(normal:torch.Tensor, yaw_angles:torch.Tensor=0):
120
+ '''generate an orthonormal/Rotation matrix basis from a normal vector in 3d
121
+
122
+ normal is assumed to be normalised
123
+
124
+ returns a (no. of yaw_angles)x3x3 matrix with the basis vectors as columns, 3rd column is the original normal vector
125
+ '''
126
+ n = len(yaw_angles)
127
+ x = rotate_vector_t(vec_perp_t(normal), normal, yaw_angles)
128
+ # x = x / torch.norm(x, p=2)
129
+ y = torch.cross(normal.view(-1,1), x)
130
+ # y = y / torch.norm(y, p=2, dim=1)
131
+ return torch.cat([x.t(), normal.unsqueeze(0).repeat(n, 1), y.t()],dim=1).reshape(n,3,3).transpose(2,1) # the vectors should be as columns
132
+
133
+ def rotate_vector_t(v, k, theta):
134
+ '''rotate a vector v around an axis k by an angle theta
135
+ it is assumed that k is a unit vector (p2 norm = 1)'''
136
+ # https://medium.com/@sim30217/rodrigues-rotation-formula-47489db49050
137
+ cos_theta = torch.cos(theta)
138
+ sin_theta = torch.sin(theta)
139
+ v2 = v.view(-1,1)
140
+
141
+ term1 = v2 * cos_theta
142
+ term2 = torch.cross(k, v).view(-1, 1) * sin_theta
143
+ term3 = (k * (k @ v)).view(-1, 1) * (1 - cos_theta)
144
+
145
+ return (term1 + term2 + term3)
146
+
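+ # Usage sketch: build candidate rotations consistent with a given (unit) ground-plane normal.
+ #   normal = torch.tensor([0., 1., 0.])
+ #   yaws = torch.linspace(0, 2 * torch.pi, 8)
+ #   Rs = orthobasis_from_normal_t(normal, yaws)   # (8, 3, 3) rotation matrices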
147
+ # ########### End rotations
148
+ def gt_in_norm_range(range,gt):
149
+ tmp = gt-range[0]
150
+ res = tmp / abs(range[1] - range[0])
151
+
152
+ return res
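+ # NOTE: the branches below are unreachable dead code because of the early return above.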
153
+
154
+ if range[0] > 0: # both positive
155
+ tmp = gt-range[0]
156
+ res = tmp / abs(range[1] - range[0])
157
+ elif range[1] > 0: # lower negative upper positive
158
+ if gt > 0:
159
+ tmp = gt-range[0]
160
+ else:
161
+ tmp = range[1]-gt
162
+ res = tmp / abs(range[1] - range[0])
163
+ else: # both negative
164
+ tmp = range[1]-gt
165
+ res = tmp / abs(range[1] - range[0])
166
+
167
+ return res
168
+
169
+ def vectorized_linspace(start_tensor, end_tensor, number_of_steps):
170
+ # Calculate spacing
171
+ spacing = (end_tensor - start_tensor) / (number_of_steps - 1)
172
+ # Create linear spaces with arange
173
+ linear_spaces = torch.arange(start=0, end=number_of_steps, dtype=start_tensor.dtype, device=start_tensor.device)
174
+ linear_spaces = linear_spaces.repeat(start_tensor.size(0),1)
175
+ linear_spaces = linear_spaces * spacing[:,None] + start_tensor[:,None]
176
+ return linear_spaces
177
+
178
+
179
+
180
+ ##### Scoring
181
+ def iou_2d(gt_box, proposal_boxes):
182
+ '''
183
+ gt_box: Boxes
184
+ proposal_box: Boxes
185
+ '''
186
+ IoU = pairwise_iou(gt_box,proposal_boxes).flatten()
187
+ return IoU
188
+
189
+ def iou_3d(gt_cube, proposal_cubes):
190
+ """
191
+ Compute the Intersection over Union (IoU) of two 3D cubes.
192
+
193
+ Parameters:
194
+ - gt_cube: GT Cube.
195
+ - proposal_cubes: Proposal Cubes.
196
+
197
+ Returns:
198
+ - iou: Intersection over Union (IoU) value.
199
+ """
200
+ gt_corners = gt_cube.get_all_corners()[0]
201
+ proposal_corners = proposal_cubes.get_all_corners()[0]
202
+ vol, iou = box3d_overlap(gt_corners,proposal_corners)
203
+ iou = iou[0]
204
+
205
+ return iou
206
+
207
+ def custom_mapping(x,beta=1.7):
208
+ '''
209
+ maps the input curve to be S shaped instead of linear
210
+
211
+ Args:
212
+ x: list of floats between and including 0 and 1
213
+ beta: number > 1, higher beta is more aggressive
214
+
215
+ '''
216
+ mapped_list = []
217
+ for i in range(len(x)):
218
+ if x[i] <= 0:
219
+ mapped_list.append(0.0)
220
+ else:
221
+ mapped_list.append((1 / (1 + (x[i] / (1 - x[i])) ** (-beta))))
222
+
223
+ return mapped_list
224
+
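+ # Usage sketch: squash linear scores in [0, 1] into an S-curve (suppresses low, boosts high).
+ #   custom_mapping([0.1, 0.5, 0.9], beta=1.7)   # -> roughly [0.02, 0.5, 0.98]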
225
+ def mask_iou(segmentation_mask, bube_mask):
226
+ '''
227
+ Area is of segmentation_mask
228
+ '''
229
+ bube_mask = torch.tensor(bube_mask, device=segmentation_mask.device)
230
+ intersection = (segmentation_mask * bube_mask).sum()
231
+ if intersection == 0:
232
+ return torch.tensor(0.0)
233
+ union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
234
+ return intersection / union
235
+
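+ # Usage sketch: score how well a projected-cube mask overlaps a segmentation mask.
+ #   seg = torch.zeros(480, 640, dtype=torch.bool); seg[100:200, 100:200] = True
+ #   proj = np.zeros((480, 640), dtype=bool); proj[150:250, 150:250] = True
+ #   mask_iou(seg, proj)   # scalar tensor IoU of the two masks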
236
+ def mod_mask_iou(segmentation_mask, bube_mask):
237
+ '''
238
+ Area is of segmentation_mask
239
+ '''
240
+ bube_mask = torch.tensor(bube_mask, device=segmentation_mask.device)
241
+ intersection = (segmentation_mask * bube_mask).sum()
242
+ if intersection == 0:
243
+ return torch.tensor(0.0)
244
+ union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
245
+ return intersection**5 / union # NOTE not standard IoU
246
+
247
+ def mask_iou_loss(segmentation_mask, bube_mask):
248
+ '''
249
+ Area is of segmentation_mask
250
+ '''
251
+ intersection = (segmentation_mask * bube_mask).sum()
252
+ if intersection == 0:
253
+ return torch.tensor(0.0)
254
+ union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
255
+ return intersection / union
256
+
257
+ def is_gt_included(gt_cube,x_range,y_range,z_range, w_prior, h_prior, l_prior):
258
+ # Define how far away dimensions need to be to be counted as unachievable
259
+ stds_away = 1.5
260
+ # Center
261
+ because_of = []
262
+ if not (x_range[0] < gt_cube.center[0] < x_range[1]):
263
+ if (gt_cube.center[0] < x_range[0]):
264
+ val = abs(x_range[0] - gt_cube.center[0])
265
+ else:
266
+ val = abs(gt_cube.center[0] - x_range[1])
267
+ because_of.append(f'x by {val:.1f}')
268
+ if not (y_range[0] < gt_cube.center[1] < y_range[1]):
269
+ if (gt_cube.center[1] < y_range[0]):
270
+ val = abs(y_range[0] - gt_cube.center[1])
271
+ else:
272
+ val = abs(gt_cube.center[1] - y_range[1])
273
+ because_of.append(f'y by {val:.1f}')
274
+ # Depth
275
+ if not (z_range[0] < gt_cube.center[2] < z_range[1]):
276
+ if (gt_cube.center[2] < z_range[0]):
277
+ val = abs(z_range[0] - gt_cube.center[2])
278
+ else:
279
+ val = abs(gt_cube.center[2] - z_range[1])
280
+ because_of.append(f'z by {val:.1f}')
281
+ # Dimensions
282
+ if (gt_cube.dimensions[0] < w_prior[0]-stds_away*w_prior[1]):
283
+ because_of.append('w-')
284
+ if (gt_cube.dimensions[0] > w_prior[0]+stds_away*w_prior[1]):
285
+ because_of.append('w+')
286
+ if (gt_cube.dimensions[1] < h_prior[0]-stds_away*h_prior[1]):
287
+ because_of.append('h-')
288
+ if (gt_cube.dimensions[1] > h_prior[0]+stds_away*h_prior[1]):
289
+ because_of.append('h+')
290
+ if (gt_cube.dimensions[2] < l_prior[0]-stds_away*l_prior[1]):
291
+ because_of.append('l-')
292
+ if (gt_cube.dimensions[2] > l_prior[0]+stds_away*l_prior[1]):
293
+ because_of.append('l+')
294
+ if because_of == []:
295
+ return True
296
+ else:
297
+ print('GT cannot be found due to',because_of)
298
+ return False
299
+
300
+ # rotation nothing yet
301
+
302
+ def euler_to_unit_vector(eulers):
303
+ """
304
+ Convert Euler angles to a unit vector.
305
+ """
306
+ yaw, pitch, roll = eulers
307
+
308
+ # Calculate the components of the unit vector
309
+ x = np.cos(yaw) * np.cos(pitch)
310
+ y = np.sin(yaw) * np.cos(pitch)
311
+ z = np.sin(pitch)
312
+
313
+ # Normalize the vector
314
+ length = np.sqrt(x**2 + y**2 + z**2)
315
+ unit_vector = np.array([x, y, z]) / length
316
+
317
+ return unit_vector
318
+
319
+
320
+ # helper functions for plotting segmentation masks
321
+ def show_mask(mask, ax, random_color=False):
322
+ if random_color:
323
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
324
+ else:
325
+ color = np.array([30/255, 144/255, 255/255, 0.6])
326
+ h, w = mask.shape[-2:]
327
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
328
+ ax.imshow(mask_image)
329
+
330
+ def show_mask2(masks:np.array, im:np.array, random_color=False):
331
+ """
332
+ Display the masks on top of the image.
333
+
334
+ Args:
335
+ masks (np.array): Array of binary masks, each of shape (h, w).
336
+ im (np.array): Image with shape (h, w, 3).
337
+ random_color (bool, optional): Whether to use random colors for the masks. Defaults to False.
338
+
339
+ Returns:
340
+ np.array: Image with masks displayed on top.
341
+ """
342
+ im_expanded = np.concatenate((im, np.ones((im.shape[0],im.shape[1],1))*255), axis=-1)/255
343
+
344
+ mask_image = np.zeros((im.shape[0],im.shape[1],4))
345
+ for i, mask in enumerate(masks):
346
+ if isinstance(random_color, list):
347
+ color = random_color[i]
348
+ else:
349
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
350
+ h, w = mask.shape[-2:]
351
+ mask_sub = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
352
+ mask_image = mask_image + mask_sub
353
+ mask_binary = (mask_image > 0).astype(bool)
354
+ im_out = im_expanded * ~mask_binary + (0.5* mask_image + 0.5 * (im_expanded * mask_binary))
355
+ im_out = im_out.clip(0,1)
356
+ return im_out
357
+
358
+ def show_points(coords, labels, ax, marker_size=375):
359
+ pos_points = coords[labels==1]
360
+ neg_points = coords[labels==0]
361
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
362
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
363
+
364
+ def show_box(box, ax):
365
+ x0, y0 = box[0], box[1]
366
+ w, h = box[2] - box[0], box[3] - box[1]
367
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
368
+
369
+
370
+
371
+
372
+
373
+
374
+ # Convex Hull
375
+ import torch
376
+
377
+ def direction(p1, p2, p3):
378
+ return (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0])
379
+
380
+ def distance_sq(p1, p2):
381
+ return (p2[0] - p1[0])**2 + (p2[1] - p1[1])**2
382
+
383
+ def findDuplicates(arr):
384
+ Len = len(arr)
385
+ ifPresent = False
386
+ a1 = []
387
+ idx = []
388
+ for i in range(Len - 1):
389
+ for j in range(i + 1, Len):
390
+ # Check whether the element appears again later in the list; if so, record it
391
+ if torch.all(arr[i] == arr[j]):
392
+ # if len(a1) == 0:
393
+ # a1 arr[i]
394
+ # idx.append(i)
395
+ # ifPresent = True
396
+ # else:
397
+ # # if arr[i] in a1:
398
+ # # break
399
+ # # # If element is not present in the ArrayList then add it to ArrayList and make ifPresent true
400
+ # # else:
401
+ a1.append(arr[i])
402
+ idx.append(i)
403
+ ifPresent = True
404
+
405
+ if ifPresent:
406
+ return set(idx) # lazy, inefficient implementation
407
+ else:
408
+ return None
409
+
410
+ def jarvis_march(points):
411
+ '''https://algorithmtutor.com/Computational-Geometry/Convex-Hull-Algorithms-Jarvis-s-March/
412
+ https://algorithmtutor.com/Computational-Geometry/Determining-if-two-consecutive-segments-turn-left-or-right/ '''
413
+ # remove duplicates
414
+ duplicates = findDuplicates(points)
415
+ # this is necessary if there are > 2 duplicates of the same element
416
+ if duplicates is not None:
417
+ plusone = torch.zeros_like(points)
418
+ for i, d in enumerate(duplicates):
419
+ plusone[d] += i + 1
420
+ points = points + plusone
421
+
422
+ # find the lower left point
423
+ min_x = torch.min(points[:, 0])
424
+ candidates = (points[:, 0] == min_x).nonzero(as_tuple=True)[0]
425
+
426
+ # If there are multiple points, choose the one with the highest y value
427
+ if len(candidates) > 1:
428
+ index = candidates[torch.argmax(points[candidates][:, 1])]
429
+ else:
430
+ index = candidates[0]
431
+
432
+ a = points[index]
433
+
434
+ # gift wrapping: repeatedly pick the most counter-clockwise remaining point
435
+ l = index
436
+ result = []
437
+ result.append(a)
438
+
439
+ while (True):
440
+ q = (l + 1) % len(points)
441
+ for i in range(len(points)):
442
+ if i == l:
443
+ continue
444
+ # find the greatest left turn
445
+ # in case of collinearity, consider the farthest point
446
+ d = direction(points[l], points[i], points[q])
447
+ if d > 0 or (d == 0 and distance_sq(points[i], points[l]) > distance_sq(points[q], points[l])):
448
+ q = i
449
+ l = q
450
+ if l == index:
451
+ break
452
+ result.append(points[q])
453
+
454
+ return torch.flip(torch.stack(result), [0,])
455
+
456
+ def fill_polygon(mask, polygon):
457
+ '''
458
+ inspired by https://web.archive.org/web/20120323102807/http://local.wasp.uwa.edu.au/~pbourke/geometry/insidepoly/
459
+ '''
460
+ h, w = mask.shape
461
+ Y, X = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij') # 'ij' gives matrix indexing; 'xy' would be the numpy convention
462
+ grid_coords = torch.stack([X.flatten(), Y.flatten()], dim=1).float().to(mask.device)
463
+
464
+ new_mask = torch.ones(h, w, device=mask.device)
465
+ zeros = torch.zeros(h, w, device=mask.device)
466
+ ones = torch.ones(h, w, device=mask.device)
467
+
468
+ # For some reason it is easier for me to comprehend the algorithm if we iterate counter-clockwise
469
+ for i in range(len(polygon)):
470
+ v1 = polygon[i]
471
+ v2 = polygon[(i + 1) % len(polygon)]
472
+
473
+ # Determine the direction of the edge
474
+ edge_direction = v2 - v1
475
+
476
+ # Given a line segment between P0 (x0,y0) and P1 (x1,y1), another point P (x,y) has the following relationship to the line segment.
477
+ # Compute
478
+ # (y - y0) (x1 - x0) - (x - x0) (y1 - y0)
479
+ # Check if the point is to the left of the edge
480
+ points = (grid_coords[:, 0] - v1[0]) * edge_direction[1] - (grid_coords[:, 1] - v1[1]) * edge_direction[0]
481
+ # we can do the threshold in a clever differentiable way
482
+ # this sets all values to be between 0 and 1
483
+ is_left = torch.min(torch.max(points.view(h, w), zeros), ones)
484
+
485
+ # do the intersection of the 2 masks, this progressily builds op the polygon
486
+ new_mask = new_mask * is_left
487
+
488
+ return new_mask
489
+
490
+ def convex_hull(mask, coords):
491
+ hull = jarvis_march(coords)
492
+ new_mask = fill_polygon(mask, hull)
493
+ return new_mask
494
+
495
+ if __name__ == '__main__':
496
+ import matplotlib.pyplot as plt
497
+ mask = torch.zeros(700, 700, dtype=torch.bool)
498
+ # p = torch.tensor([[5,6],[21.0,7],[21,20],[10,20],[15,20],[5,20],[11,8],[15,15],[17,6],[11,15]])
499
+
500
+ p = torch.tensor([[271.0000, 356.0000],
501
+ [ 25.3744, 356.0000],
502
+ [ 0.0000, 356.0000],
503
+ [ 0.0000, 89.5266],
504
+ [271.0000, 159.3112],
505
+ [ 95.5653, 201.7484],
506
+ [ 0.0000, 0.0000],
507
+ [271.0000, 0.0000]])
508
+
509
+ p2 = torch.tensor([[150.3456, 0.0000],
510
+ [479.0000, 0.0000],
511
+ [ 11.8427, 0.0000],
512
+ [ 0.0000, 0.0000],
513
+ [121.4681, 232.5976],
514
+ [375.6230, 383.9329],
515
+ [ 12.8765, 630.0000],
516
+ [ 0.0000, 344.7250]])
517
+
518
+ p3 = torch.tensor([[290.9577, 171.1176],
519
+ [197.7348, 483.7612],
520
+ [383.0000, 504.0000],
521
+ [383.0000, 27.6211],
522
+ [ 2.2419, 52.6505],
523
+ [ 0.0000, 399.6908],
524
+ [ 0.0000, 504.0000],
525
+ [ 0.0000, 0.0000]])
526
+
527
+ p4 = torch.tensor([[271.0000, 19.5241],
528
+ [271.0000, 356.0000],
529
+ [ 0.0000, 0.0000],
530
+ [271.0000, 0.0000],
531
+ [ 0.0000, 0.0000],
532
+ [163.0264, 77.9408],
533
+ [164.2467, 321.0222],
534
+ [ 0.0000, 356.0000],
535
+ [ 0.0000, 0.0000]])
536
+
537
+ p5 = torch.tensor([[272.0000, 1.0000],
538
+ [ 0.0000, 173.5156],
539
+ [ 74.8860, 141.3913],
540
+ [253.8221, 0.0000],
541
+ [271.0000, 0.0000],
542
+ [271.0000, 356.0000],
543
+ [262.5294, 327.9978],
544
+ [271.0000, 120.8048]])
545
+
546
+ mask5 = convex_hull(mask, p5)
547
+ mask4 = convex_hull(mask, p4)
548
+ mask1 = convex_hull(mask, p)
549
+ mask2 = convex_hull(mask, p2)
550
+ mask3 = convex_hull(mask, p3)
551
+ fig, ax = plt.subplots(1,5, figsize=(20,5))
552
+ ax[0].scatter(p[:,0], p[:,1], c='r')
553
+ ax[1].scatter(p2[:,0], p2[:,1], c='b')
554
+ ax[2].scatter(p3[:,0], p3[:,1], c='g')
555
+ ax[3].scatter(p4[:,0], p4[:,1], c='y')
556
+ ax[4].scatter(p5[:,0], p5[:,1], c='m')
557
+
558
+ ax[0].imshow(mask1)
559
+ ax[1].imshow(mask2)
560
+ ax[2].imshow(mask3)
561
+ ax[3].imshow(mask4)
562
+ ax[4].imshow(mask5)
563
+ plt.show()
564
+ a = 2
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
  title: Weak Cube RCNN
3
- emoji:
4
  colorFrom: indigo
5
- colorTo: purple
6
  sdk: docker
7
  pinned: false
8
- license: cc-by-nc-sa-4.0
9
- short_description: Weak Cube RCNN model
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Weak Cube RCNN
3
+ emoji: 🎲
4
  colorFrom: indigo
5
+ colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
+ license: apache-2.0
 
9
  ---
10
 
11
+ https://github.com/AndreasLH/Weak-Cube-R-CNN
VisualiseGT.py ADDED
@@ -0,0 +1,830 @@
1
+ from pycocotools.coco import COCO
2
+ import os
3
+ import random
4
+ from functools import reduce
5
+ from io import StringIO
6
+
7
+ from detectron2.utils.visualizer import Visualizer
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import pandas as pd
11
+ from scipy import stats
12
+
13
+ from cubercnn import data, util, vis
14
+ from cubercnn.config import get_cfg_defaults
15
+ from cubercnn.data.build import (build_detection_test_loader,
16
+ build_detection_train_loader)
17
+ from cubercnn.data.dataset_mapper import DatasetMapper3D
18
+ from cubercnn.data.datasets import load_omni3d_json, simple_register
19
+ from detectron2.config import get_cfg
20
+ from detectron2.data import DatasetCatalog, MetadataCatalog
21
+ from detectron2.structures.boxes import BoxMode
22
+ from detectron2.utils.logger import setup_logger
23
+
24
+ color = '#384860'
25
+ second_color = '#97a6c4'
26
+
27
+ def load_gt(dataset='SUNRGBD', mode='test', single_im=True, filter=False, img_idx=150):
28
+
29
+ # we can do this block of code to get the categories reduced number of categories in the sunrgbd dataset as there normally is 83 categories, however we only work with 38.
30
+ config_file = 'configs/Base_Omni3D.yaml'
31
+ if filter:
32
+ cfg, filter_settings = get_config_and_filter_settings(config_file)
33
+ else:
34
+ filter_settings = None
35
+
36
+ if mode == 'test':
37
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset+'_test.json']
38
+ elif mode == 'train':
39
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset+'_train.json']
40
+
41
+ # Get Image and annotations
42
+ try:
43
+ dataset = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
44
+ except:
45
+ print('Dataset does not exist or is not in the correct format!')
46
+ exit()
47
+ imgIds = dataset.getImgIds()
48
+ imgs = dataset.loadImgs(imgIds)
49
+ if single_im:
50
+ # img = random.choice(imgs)
51
+ # 730 and 150 are used in the report
52
+ img = imgs[img_idx]
53
+ annIds = dataset.getAnnIds(imgIds=img['id'])
54
+ else:
55
+ # get all annotations
56
+ img = imgs
57
+ annIds = dataset.getAnnIds()
58
+
59
+ anns = dataset.loadAnns(annIds)
60
+
61
+ # Extract necessary annotations
62
+ R_cams = []
63
+ center_cams = []
64
+ dimensions_all = []
65
+ cats = []
66
+ bboxes = []
67
+ for instance in anns:
68
+ if 'bbox2D_tight' in instance and instance['bbox2D_tight'][0] != -1:
69
+ bboxes.append(instance['bbox2D_tight']) # boxes are XYXY_ABS by default
70
+
71
+ elif 'bbox2D_trunc' in instance and not np.all([val==-1 for val in instance['bbox2D_trunc']]):
72
+ bboxes.append(instance['bbox2D_trunc']) # boxes are XYXY_ABS by default
73
+
74
+ elif 'bbox2D_proj' in instance:
75
+ bboxes.append(instance['bbox2D_proj']) # boxes are XYXY_ABS by default
76
+
77
+ else:
78
+ continue
79
+
80
+ R_cams.append(instance['R_cam'])
81
+ center_cams.append(instance['center_cam'])
82
+ dimensions_all.append(instance['dimensions'])
83
+ cats.append(instance['category_name'])
84
+
85
+ return img, R_cams, center_cams, dimensions_all, cats, bboxes
86
+
87
+
88
+
89
+ def plot_scene(image_path, output_dir, center_cams, dimensions_all, Rs, K, cats, bboxes):
90
+ # TODO: currently this function does not filter out invalid annotations, but it should have the option to do so.
91
+ # Compute meshes
92
+ meshes = []
93
+ meshes_text = []
94
+ for idx, (center_cam, dimensions, pose, cat) in enumerate(zip(
95
+ center_cams, dimensions_all, Rs, cats
96
+ )):
97
+ bbox3D = center_cam + dimensions
98
+ meshes_text.append('{}'.format(cat))
99
+ color = [c/255.0 for c in util.get_color(idx)]
100
+ box_mesh = util.mesh_cuboid(bbox3D, pose, color=color)
101
+ meshes.append(box_mesh)
102
+
103
+ image_name = util.file_parts(image_path)[1]
104
+ print('File: {} with {} dets'.format(image_name, len(meshes)))
105
+ np.random.seed(0)
106
+ colors = [np.concatenate([np.random.random(3), np.array([0.6])], axis=0) for _ in range(len(meshes))]
107
+
108
+ # Plot
109
+ image = util.imread('datasets/'+image_path)
110
+ if len(meshes) > 0:
111
+ im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(image, np.array(K), meshes, colors=colors, text=meshes_text, scale=image.shape[0], blend_weight=0.5, blend_weight_overlay=0.85)
112
+
113
+ if False:
114
+ im_concat = np.concatenate((im_drawn_rgb, im_topdown), axis=1)
115
+ vis.imshow(im_concat)
116
+
117
+ util.imwrite(im_drawn_rgb, os.path.join(output_dir, image_name+'_boxes.jpg'))
118
+ util.imwrite(im_topdown, os.path.join(output_dir, image_name+'_novel.jpg'))
119
+ v_pred = Visualizer(image, None)
120
+ #bboxes = [[320, 150, 560, 340]] # low loss
121
+ #bboxes = [[350, 220, 440, 290]] # high loss
122
+ #bboxes = [[340, 163, 540, 297]] # fail loss
123
+ v_pred = v_pred.overlay_instances(boxes=np.array(bboxes), assigned_colors=colors)#[np.array([0.5,0,0.5])])#colors)
124
+ util.imwrite(v_pred.get_image(), os.path.join(output_dir, image_name+'_pred_boxes.jpg'))
125
+
126
+ #im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(v_pred.get_image(), np.array(K), meshes, colors=colors, text=meshes_text, scale=image.shape[0], blend_weight=0.5, blend_weight_overlay=0.85)
127
+ #util.imwrite(im_drawn_rgb, os.path.join(output_dir, image_name+'_boxes_with_2d.jpg'))
128
+ else:
129
+ print('No meshes')
130
+ util.imwrite(image, os.path.join(output_dir, image_name+'_boxes.jpg'))
131
+
132
+
133
+
134
+ def show_data(dataset, filter_invalid=False, output_dir='output/playground'):
135
+ # Load Image and Ground Truths
136
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, filter=filter_invalid)
137
+
138
+ # Create Output Directory
139
+ util.mkdir_if_missing(output_dir)
140
+
141
+ plot_scene(image['file_path'], output_dir, center_cams, dimensions_all, Rs, image['K'], cats, bboxes)
142
+
143
+
144
+ def category_distribution(dataset):
145
+ '''Plot a histogram of the category distribution in the dataset.'''
146
+ # Load Image and Ground Truths
147
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
148
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
149
+ config_file = 'configs/Base_Omni3D.yaml'
150
+ cfg, filter_settings = get_config_and_filter_settings(config_file)
151
+ annotation_file = 'datasets/Omni3D/SUNRGBD_train.json'
152
+ coco_api = COCO(annotation_file)
153
+ meta = MetadataCatalog.get('SUNRGBD')
154
+ cat_ids = sorted(coco_api.getCatIds(filter_settings['category_names']))
155
+ cats_sun = coco_api.loadCats(cat_ids)
156
+ thing_classes = [c["name"] for c in sorted(cats_sun, key=lambda x: x["id"])]
157
+
158
+ output_dir = 'output/figures/' + dataset
159
+ util.mkdir_if_missing(output_dir)
160
+
161
+ # histogram of categories
162
+ cats_all = cats + cats_t
163
+ # cats_unique = list(set(cats_all))
164
+ cats_unique = thing_classes
165
+ print('cats unique: ', len(cats_unique))
166
+ # make dict with count of each category
167
+ cats_count = {cat: cats_all.count(cat) for cat in cats_unique}
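# For reference, collections.Counter(cats_all) would compute the same counts in a single pass;
# the dict comprehension above additionally keeps zero-count categories from thing_classes.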
168
+ cats_sorted = dict(sorted(cats_count.items(), key=lambda x: x[1], reverse=True))
169
+
170
+ plt.figure(figsize=(14,5))
171
+ plt.bar(cats_sorted.keys(), cats_sorted.values())
172
+ plt.xticks(rotation=60, size=9)
173
+
174
+ plt.title('Category Distribution')
175
+ plt.savefig(os.path.join(output_dir, 'category_distribution.png'),dpi=300, bbox_inches='tight')
176
+ plt.close()
177
+
178
+ return cats_sorted
179
+
180
+ def spatial_statistics(dataset):
181
+ '''Compute spatial statistics of the dataset.
182
+ This is intended to reproduce Fig. 7 from the Omni3D paper;
183
+ however, the images must be standardised to a fixed size for it to work.
184
+ '''
185
+ # Load Image and Ground Truths
186
+ # this function filters out invalid images if there are no valid annotations in the image
187
+ # annotations in each image can also be marked as is_ignore => True
188
+ image_root = 'datasets'
189
+ cfg, filter_settings = get_config_and_filter_settings()
190
+ dataset_names = ['SUNRGBD_train','SUNRGBD_test','SUNRGBD_val']
191
+ output_dir = 'output/figures/' + dataset
192
+
193
+ # this is almost the same as the simple_register function, but it also stores the model metadata
194
+ # which is needed for the load_omni3d_json function
195
+ data.register_and_store_model_metadata(None, output_dir, filter_settings=filter_settings)
196
+
197
+ data_dicts = []
198
+ for dataset_name in dataset_names:
199
+ json_file = 'datasets/Omni3D/'+dataset_name+'.json'
200
+ data_dict = load_omni3d_json(json_file, image_root, dataset_name, filter_settings, filter_empty=True)
201
+ data_dicts.extend(data_dict)
202
+
203
+
204
+ # standardise the images to a fixed size
205
+ # and map the annotations to the standardised images
206
+ std_image_size = (480//4, 640//4)
207
+ tot_outliers = 0
208
+ img = np.zeros(std_image_size)
209
+ for img_dict in data_dicts:
210
+ original_width = img_dict['width']
211
+ original_height = img_dict['height']
212
+
213
+ # Calculate the scale factor for resizing
214
+ scale_x = std_image_size[1] / original_width
215
+ scale_y = std_image_size[0] / original_height
216
+
217
+ # Update the image size in the annotation
218
+ img_dict['width'] = std_image_size[1]
219
+ img_dict['height'] = std_image_size[0]
220
+ for anno in img_dict['annotations']:
221
+ if not anno['ignore']:
222
+ # Update the 2D box coordinates (boxes are XYWH)
223
+ anno['bbox2D_tight'][0] *= scale_x
224
+ anno['bbox2D_tight'][1] *= scale_y
225
+ anno['bbox2D_tight'][2] *= scale_x
226
+ anno['bbox2D_tight'][3] *= scale_y
227
+ # get the centerpoint of the annotation as (x, y)
228
+ # x0, y0, x1, y1 = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
229
+ x0, y0, x1, y1 = anno['bbox2D_tight']
230
+ x_m, y_m = int((x0+x1)/2), int((y0+y1)/2)
231
+ if x_m >= std_image_size[1] or x_m < 0:
232
+ # print(f'x out of line {x_m}')
233
+ tot_outliers += 1
234
+ elif y_m >= std_image_size[0] or y_m < 0:
235
+ # print(f'y out of line {y_m}')
236
+ tot_outliers += 1
237
+ else:
238
+ img[y_m, x_m] += 1
239
+ else:
240
+ # Remove the annotation if it is marked as ignore
241
+ img_dict['annotations'].remove(anno)
242
+
243
+
244
+ print('num center points outside frame: ', tot_outliers)
245
+ img = img/img.max()
246
+ # the count in this bin is so large that all other points become invisible, so it is removed.
247
+ img[0,0] = 0.00
248
+ img = img/img.max()
249
+ plt.figure()
250
+ plt.imshow(img, cmap='gray_r', vmin=0, vmax=1)
251
+ plt.xticks([]); plt.yticks([])
252
+ plt.title('Histogram of 2D box centre points')
253
+ # plt.box(False)
254
+ plt.savefig(os.path.join(output_dir, '2d_histogram.png'),dpi=300, bbox_inches='tight')
255
+ plt.close()
256
+ return
257
+
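# A small worked example of the box rescaling used in spatial_statistics above
# (the box values are made up): a box [100, 50, 300, 250] in a 640x480 image mapped onto
# the 160x120 grid has scale factors 0.25, so its centre lands at pixel (50, 37).
def _rescale_centre_example():
    scale_x, scale_y = 160 / 640, 120 / 480
    x0, y0, x1, y1 = 100 * scale_x, 50 * scale_y, 300 * scale_x, 250 * scale_y
    return int((x0 + x1) / 2), int((y0 + y1) / 2)  # -> (50, 37)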
258
+ def AP_vs_no_of_classes(dataset, files:list=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
259
+ '''Search the log file for the precision numbers corresponding to the last iteration
260
+ then parse it into a pd.DataFrame and plot AP3D against the per-class annotation time'''
261
+ # search the file from the back until the line
262
+ # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
263
+ # is found
264
+
265
+ target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
266
+ model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
267
+ df = []
268
+ for file, model_name in zip(files, model_names):
269
+ df_i = search_file_backwards(file, target_line).rename(columns={'AP3D':f'{model_name} AP3D', 'AP2D':f'{model_name} AP2D'})
270
+ assert df_i is not None, 'df not found'
271
+ df.append(df_i)
272
+ # merge df's
273
+ df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
274
+ # sort df by ap3d of model 1
275
+ df = df.sort_values(by='Base Cube R-CNN AP3D', ascending=False)
276
+
277
+ cats = category_distribution(dataset)
278
+ df.sort_values(by='category', inplace=True)
279
+ cats = dict(sorted(cats.items()))
280
+ merged_df = pd.merge(df.reset_index(), pd.DataFrame(cats.values(), columns=['cats']), left_index=True, right_index=True)
281
+ merged_df = merged_df.sort_values(by='cats')
282
+ merged_df = merged_df.drop('index',axis=1)
283
+ merged_df = merged_df.reset_index(drop=True)
284
+
285
+
286
+ fig, ax = plt.subplots(figsize=(12,8))
287
+ for model_name in model_names:
288
+ if model_name == 'Base Cube R-CNN':
289
+ scale = 114
290
+ else:
291
+ scale = 10.15
292
+ # convert the annotation time to hours
293
+ time = merged_df['cats']*scale / 60 / 60
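# 'cats' holds the per-category annotation count, so count * seconds-per-annotation / 3600
# gives hours; the scale values above (114 vs 10.15) presumably reflect the cost of a full
# 3D annotation versus a 2D-only annotation.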
294
+ ax.scatter(time, merged_df[f'{model_name} AP3D'].values, s=merged_df[f'{model_name} AP2D'].values*2, alpha=0.5, label=model_name)
295
+
296
+ for i, txt in enumerate(merged_df['category']):
297
+ ax.text(time[i], merged_df[f'{model_name} AP3D'].values[i], txt, fontsize=merged_df[f'{model_name} AP3D'].values[i]*0.3+3)
298
+
299
+ correlation_coef = np.corrcoef(time, merged_df[f'{model_name} AP3D'].values)[0, 1]
300
+ line_fit = np.polyfit(time, merged_df[f'{model_name} AP3D'].values, 1)
301
+
302
+ # plot the line of best fit
303
+ ax.plot(time, np.poly1d(line_fit)(time), linestyle='--',alpha=0.5, label=f'Linear fit (R={correlation_coef:.2f})')
304
+
305
+ # Set labels and title
306
+ ax.set_xlabel('Annotation time (h)')
307
+ ax.set_ylabel('AP3D')
308
+ ax.set_xscale('log')
309
+ ax.set_title('AP3D vs class-wise annotation time')
310
+ ax.legend(title='AP3D scaled by AP2D')
311
+
312
+ # Save the plot
313
+ plt.savefig('output/figures/'+dataset+'/AP_vs_no_of_classes_all.png', dpi=300, bbox_inches='tight')
314
+ plt.close()
315
+
316
+ return
317
+
318
+ def AP3D_vs_AP2D(dataset, mode = 'standard', files=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
319
+ '''Search the log file for the precision numbers corresponding to the last iteration
320
+ then parse it into a pd.DataFrame and plot AP3D against AP2D for each category'''
321
+
322
+ # search the file from the back until the line
323
+ # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
324
+ # is found
325
+
326
+ target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
327
+ model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
328
+ df = []
329
+ for file, model_name in zip(files, model_names):
330
+ df_i = search_file_backwards(file, target_line).rename(columns={'AP3D':f'{model_name} AP3D', 'AP2D':f'{model_name} AP2D'})
331
+ assert df_i is not None, 'df not found'
332
+ df.append(df_i)
333
+ # merge df's
334
+ df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
335
+ # sort df by ap3d of model 1
336
+ df = df.sort_values(by='Base Cube R-CNN AP3D', ascending=False)
337
+
338
+ cats = category_distribution(dataset)
339
+ df.sort_values(by='category', inplace=True)
340
+ cats = dict(sorted(cats.items()))
341
+ merged_df = pd.merge(df.reset_index(), pd.DataFrame(cats.values(), columns=['cats']), left_index=True, right_index=True)
342
+ merged_df = merged_df.sort_values(by='cats')
343
+ merged_df = merged_df.drop('index',axis=1)
344
+ merged_df = merged_df.reset_index(drop=True)
345
+
346
+ # mode = 'standard' # 'log'
347
+
348
+ fig, ax = plt.subplots(figsize=(12,8))
349
+ for model_name in model_names:
350
+ if mode == 'standard': s=merged_df[f'{model_name} AP2D'].values*2
351
+ else: s = None
352
+ # we have to add 0.001 to the values to avoid log(0) errors
353
+ ax.scatter(merged_df[f'{model_name} AP2D'].values+0.001, merged_df[f'{model_name} AP3D'].values+0.001, alpha=0.5, label=model_name, s=s)
354
+ for i, txt in enumerate(merged_df['category']):
355
+ if mode == 'standard': fontsize=merged_df[f'{model_name} AP3D'].values[i]*0.3+3
356
+ else: fontsize=7
357
+ ax.text(merged_df[f'{model_name} AP2D'].values[i]+0.001, merged_df[f'{model_name} AP3D'].values[i]+0.001, txt,fontsize=fontsize)
358
+ # plot average line
359
+ ax.plot((0, 70), (0, 70), linestyle='--', color=color, alpha=0.3, label=f'AP2D=AP3D')
360
+
361
+ # Set labels and title
362
+ if mode == 'log':
363
+ ax.set_xscale('log')
364
+ ax.set_yscale('log')
365
+ ax.set_xlabel('AP2D')
366
+ ax.set_ylabel('AP3D')
367
+ # ax.set_xlim(0.1, 75); ax.set_ylim(0.1, 75)
368
+ ax.set_title('AP in 3D vs AP in 2D')
369
+ ax.legend()
370
+ # if mode == 'log':
371
+ # # for some obscure reason the log plot fails to save
372
+ # plt.show()
373
+
374
+ # # Save the plot
375
+ # else:
376
+ plt.savefig('output/figures/'+dataset+f'/AP3D_vs_AP2D_all_{mode}.png', dpi=300, bbox_inches='tight')
377
+ plt.close()
378
+
379
+ return
380
+
381
+
382
+ def search_file_backwards(file_path:str, target_line:str) -> pd.DataFrame:
383
+ '''Search a file backwards for a target line and return the table of the performance of the model. The point of this is to parse the part of the log file that looks like this
384
+ | category | AP2D | AP3D | category | AP2D | AP3D | category | AP2D | AP3D |
385
+ |:----------:|:--------|:----------|:-----------:|:---------|:---------|:------------:|:----------|:-----------|
386
+ | chair | 45.9374 | 53.4913 | table | 34.5982 | 39.7769 | cabinet | 16.3693 | 14.0878 |
387
+ | lamp | 24.8081 | 7.67653 | books | 0.928978 | 0.599711 | sofa | 49.2354 | 57.9649 |
388
+
389
+ ...
390
+ To a pandas DataFrame that has 3 columns: category, AP2D, AP3D'''
391
+ import re
392
+ with open(file_path, 'r') as file:
393
+ lines = file.readlines()
394
+ for i, line in enumerate(reversed(lines)):
395
+ is_found = re.search(f'.*{target_line}$', line)
396
+ if is_found:
397
+ table = lines[-i:-i+15]
398
+ tab_as_str= ' '.join(table)
399
+ # NOTE: this parsing is ugly, but it handles the markdown-style table in the log
400
+ df = pd.read_csv( StringIO(tab_as_str.replace(' ', '')), # Get rid of whitespaces
401
+ sep='|',).dropna(axis=1, how='all').drop(0)
402
+ # https://stackoverflow.com/a/65884212
403
+ df.columns = pd.MultiIndex.from_frame(df.columns.str.split('.', expand=True)
404
+ .to_frame().fillna('0'))
405
+ df = df.stack().reset_index(level=1, drop=True).reset_index().drop('index', axis=1)
406
+ df['AP3D'] = df['AP3D'].astype(float)
407
+ df['AP2D'] = df['AP2D'].astype(float)
408
+
409
+ return df
410
+
411
+ return None
412
+
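# Minimal usage sketch for search_file_backwards; the log path is just one of the defaults above.
def _search_example(log_file='output/Baseline_sgd/log.txt'):
    target = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
    df = search_file_backwards(log_file, target)
    if df is not None:
        # highest-scoring categories first
        print(df.sort_values('AP3D', ascending=False).head())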
413
+
414
+ def get_config_and_filter_settings(config_file='configs/Base_Omni3D.yaml'):
415
+ # we must load the config file to get the filter settings
416
+ cfg = get_cfg()
417
+ get_cfg_defaults(cfg)
418
+ cfg.merge_from_file(config_file)
419
+ # must setup logger to get info about filtered out annotations
420
+ setup_logger(output=cfg.OUTPUT_DIR, name="cubercnn")
421
+ filter_settings = data.get_filter_settings_from_cfg(cfg)
422
+ return cfg, filter_settings
423
+
424
+
425
+ def init_dataloader():
426
+ '''Dataloader setup.
427
+ Currently unused: it is unclear how the data.Omni3D dataset class differs from the load_omni3d_json function, so this is kept as a third alternative. The train script does something similar to this.'''
428
+ cfg, filter_settings = get_config_and_filter_settings()
429
+
430
+ dataset_names = ['SUNRGBD_train','SUNRGBD_val']
431
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names]
432
+ for dataset_name in dataset_names:
433
+ simple_register(dataset_name, filter_settings, filter_empty=True)
434
+
435
+ # Get Image and annotations
436
+ datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
437
+ data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings)
438
+
439
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
440
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
441
+
442
+ infos = datasets.dataset['info']
443
+
444
+ dataset_id_to_unknown_cats = {}
445
+ possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1))
446
+
447
+ dataset_id_to_src = {}
448
+
449
+ for info in infos:
450
+ dataset_id = info['id']
451
+ known_category_training_ids = set()
452
+
453
+ if not dataset_id in dataset_id_to_src:
454
+ dataset_id_to_src[dataset_id] = info['source']
455
+
456
+ for id in info['known_category_ids']:
457
+ if id in dataset_id_to_contiguous_id:
458
+ known_category_training_ids.add(dataset_id_to_contiguous_id[id])
459
+
460
+ # determine and store the unknown categories.
461
+ unknown_categories = possible_categories - known_category_training_ids
462
+ dataset_id_to_unknown_cats[dataset_id] = unknown_categories
463
+
464
+ from detectron2 import data as d2data
465
+ NoOPaug = d2data.transforms.NoOpTransform()
466
+
467
+ # def NoOPaug(input):
468
+ # return input
469
+ # TODO: how to load in images without having them resized?
470
+ # data_mapper = DatasetMapper3D(cfg, augmentations=[NoOPaug], is_train=True)
471
+ data_mapper = DatasetMapper3D(cfg, is_train=True)
472
+ # test loader does resize images, like the train loader does
473
+ # this is the function that filters out the invalid annotations
474
+ data_loader = build_detection_train_loader(cfg, mapper=data_mapper, dataset_id_to_src=dataset_id_to_src, num_workers=1)
475
+ # data_loader = build_detection_test_loader(cfg, dataset_names[1], num_workers=1)
476
+
477
+ # this is a detectron 2 thing that we just have to do
478
+ data_mapper.dataset_id_to_unknown_cats = dataset_id_to_unknown_cats
479
+
480
+
481
+ for item in data_loader:
482
+ print(item)
483
+
484
+ def vol_over_cat(dataset):
485
+ '''
486
+ Error bar plot of object volume per category.
487
+ '''
488
+ # Load Image and Ground Truths
489
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
490
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
491
+
492
+ output_dir = 'output/figures/' + dataset
493
+ util.mkdir_if_missing(output_dir)
494
+
495
+ # histogram of categories
496
+ cats_all = cats + cats_t
497
+ cats_unique = list(set(cats_all))
498
+
499
+ # Create dictionary with np.prod(dimensions) for each category
500
+ cats_vol = {cat: [] for cat in cats_unique}
501
+ for cat, dims in zip(cats, dimensions_all):
502
+ if np.prod(dims) > 0:
503
+ cats_vol[cat].append(np.prod(dims))
504
+ for cat, dims in zip(cats_t, dimensions_all_t):
505
+ if np.prod(dims) > 0:
506
+ cats_vol[cat].append(np.prod(dims))
507
+
508
+ # make dict with mean and std of each category
509
+ cats_mean = {cat: np.mean(cats_vol[cat]) for cat in cats_unique}
510
+ cats_error = {cat: np.std(cats_vol[cat]) for cat in cats_unique}
511
+
512
+ keys = np.array(list(cats_mean.keys()))
513
+ means = np.array(list(cats_mean.values()))
514
+ errors = np.array(list(cats_error.values()))
515
+
516
+ # Calculate Z-scores for 5th and 95th percentiles
517
+ from scipy.stats import norm
518
+ z_lower = norm.ppf(0.05)
519
+ z_upper = norm.ppf(0.95)
520
+ bounds = []
521
+ for mean, std in zip(means, errors):
522
+ # Calculate the lower and upper bounds of the interval
523
+ lower_bound = mean + z_lower * std
524
+ upper_bound = mean + z_upper * std
525
+
526
+ bounds.append((max(0,lower_bound), upper_bound))
527
+
528
+ plt.figure(figsize=(14,5))
529
+ for i, (mean, (lower_bound, upper_bound)) in enumerate(zip(means, bounds)):
530
+ plt.vlines(x=i, ymin=lower_bound, ymax=upper_bound, color='gray', linewidth=2)
531
+ plt.plot([i], [mean], marker='o', color=color)
532
+
533
+ plt.xticks(np.arange(len(keys)), keys, rotation=60, size=9)
534
+ plt.xlabel('Category')
535
+ plt.ylabel('Volume')
536
+ plt.title('Category Distribution')
537
+ plt.savefig(os.path.join(output_dir, 'volume_distribution.png'), dpi=300, bbox_inches='tight')
538
+ plt.close()
539
+
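# Worked example of the percentile bounds used above (numbers are hypothetical):
# for a category with mean volume 0.5 m^3 and std 0.2 m^3,
#   lower = max(0, 0.5 + norm.ppf(0.05) * 0.2) ~= 0.17
#   upper = 0.5 + norm.ppf(0.95) * 0.2 ~= 0.83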
540
+ def gt_stats(dataset):
541
+ '''
542
+ Histograms of object centre and dimension distributions.
543
+ '''
544
+ # Load Image and Ground Truths
545
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
546
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
547
+
548
+ output_dir = 'output/figures/' + dataset
549
+ util.mkdir_if_missing(output_dir)
550
+
551
+ # histogram of centers
552
+ center_all = center_cams + center_cams_t
553
+ center_all = np.transpose(np.array(center_all))
554
+
555
+ # Filter -1 annotations
556
+ valid_columns = center_all[0] != -1
557
+ center_all = center_all[:,valid_columns]
558
+
559
+ x_label = ['x', 'y', 'z']
560
+ fig, axes = plt.subplots(1, len(center_all), figsize=(18, 5))
561
+ for i in range(len(center_all)):
562
+ axes[i].hist(center_all[i], color=color, bins=20)
563
+ axes[i].set_xlabel(x_label[i])
564
+ axes[i].set_ylabel('Count')
565
+ fig.suptitle('Center Distribution in Meters')
566
+ plt.savefig(os.path.join(output_dir, 'center.png'), dpi=300, bbox_inches='tight')
567
+ plt.close()
568
+
569
+ # histogram of dimensions
570
+ dimensions_all = dimensions_all + dimensions_all_t
571
+ dimensions_all = np.transpose(np.array(dimensions_all))
572
+
573
+ # Filter -1 annotations
574
+ valid_columns = dimensions_all[0] != -1
575
+ dimensions_all = dimensions_all[:,valid_columns]
576
+
577
+ x_label = ['w', 'h', 'l']
578
+ fig, axes = plt.subplots(1, len(dimensions_all), figsize=(18, 5))
579
+ for i in range(len(dimensions_all)):
580
+ axes[i].hist(dimensions_all[i], color=color, bins=20)
581
+ axes[i].set_xlabel(x_label[i])
582
+ axes[i].set_ylabel('Count')
583
+ fig.suptitle('Dimensions Distribution in Meters')
584
+ plt.savefig(os.path.join(output_dir, 'dimensions.png'), dpi=300, bbox_inches='tight')
585
+ plt.close()
586
+
587
+ def report_figures(dataset, filter_invalid=False, output_dir='output/report_images'):
588
+ # Create Output Directory
589
+ util.mkdir_if_missing(output_dir)
590
+ util.mkdir_if_missing(output_dir+'/low_green')
591
+ util.mkdir_if_missing(output_dir+'/high_green')
592
+ util.mkdir_if_missing(output_dir+'/fail_green')
593
+ util.mkdir_if_missing(output_dir+'/low_red')
594
+ util.mkdir_if_missing(output_dir+'/high_red')
595
+ util.mkdir_if_missing(output_dir+'/fail_red')
596
+ util.mkdir_if_missing(output_dir+'/low_blue')
597
+ util.mkdir_if_missing(output_dir+'/high_blue')
598
+ util.mkdir_if_missing(output_dir+'/fail_blue')
599
+
600
+ # Load Image and Ground Truths
601
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, filter=filter_invalid, img_idx=352)
602
+
603
+ gt_center = center_cams[1:]
604
+ gt_dim = dimensions_all[1:]
605
+ gt_Rs = Rs[1:]
606
+ cats = cats[1:]
607
+ gt_bb = bboxes[1:]
608
+
609
+ # Make low loss boxes for IoU, ps. z and proj
610
+ center = gt_center[-1]
611
+ dim = gt_dim[-1]
612
+ R = gt_Rs[-1]
613
+ cat = cats[-1]
614
+ bb = gt_bb[-1]
615
+ plot_scene(image['file_path'], output_dir+'/low_green', [center], [dim], [R], image['K'], [cat], [bb])
616
+
617
+ # Make high loss boxes for IoU, ps. z and proj
618
+ center = [gt_center[-1][0],gt_center[-1][1],gt_center[-1][2]+3]
619
+ dim = gt_dim[-1]
620
+ R = gt_Rs[-1]
621
+ cat = cats[-1]
622
+ bb = gt_bb[-1]
623
+ plot_scene(image['file_path'], output_dir+'/high_green', [center], [dim], [R], image['K'], [cat], [bb])
624
+
625
+ # Make fail loss boxes for IoU, ps. z and proj
626
+ center = [gt_center[-1][0]-0.03,gt_center[-1][1],gt_center[-1][2]]
627
+ dim = [0.05,0.71,0.05]
628
+ R = util.euler2mat(np.array([0,0,45]))
629
+ cat = cats[-1]
630
+ bb = gt_bb[-1]
631
+ plot_scene(image['file_path'], output_dir+'/fail_green', [center], [dim], [R], image['K'], [cat], [bb])
632
+
633
+ # Make low loss boxes for range and seg
634
+ center = gt_center[0]
635
+ dim = gt_dim[0]
636
+ R = gt_Rs[0]
637
+ cat = cats[0]
638
+ bb = gt_bb[0]
639
+ plot_scene(image['file_path'], output_dir+'/low_red', [center], [dim], [R], image['K'], [cat], [bb])
640
+
641
+ # Make high loss boxes for range and seg
642
+ center = [gt_center[0][0],gt_center[0][1]+0.3,gt_center[0][2]]
643
+ dim = [gt_dim[0][0]+1.5,gt_dim[0][1]-0.6,gt_dim[0][2]]
644
+ R = gt_Rs[0]
645
+ cat = cats[0]
646
+ bb = gt_bb[0]
647
+ plot_scene(image['file_path'], output_dir+'/high_red', [center], [dim], [R], image['K'], [cat], [bb])
648
+
649
+ # Make fail loss boxes for range and seg
650
+ center = [gt_center[0][0]+0.25,gt_center[0][1],gt_center[0][2]]
651
+ dim = [gt_dim[0][0]+0.7,gt_dim[0][1],gt_dim[0][2]]
652
+ R = gt_Rs[-1]
653
+ cat = cats[-1]
654
+ bb = gt_bb[-1]
655
+ plot_scene(image['file_path'], output_dir+'/fail_red', [center], [dim], [R], image['K'], [cat], [bb])
656
+
657
+ # Make low loss boxes for dim, pose and align
658
+ center = gt_center[1:]
659
+ dim = [[gt_dim[1][0]*1.5,gt_dim[1][1],gt_dim[1][2]*1.5], gt_dim[2]]
660
+ R = gt_Rs[1:]
661
+ cat = cats[1:]
662
+ bb = gt_bb[1:]
663
+ plot_scene(image['file_path'], output_dir+'/low_blue', center, dim, R, image['K'], cat, bb)
664
+
665
+ # Make high loss boxes for dim, pose and align
666
+ center = gt_center[1:]
667
+ dim = gt_dim[1:]
668
+ R = [util.euler2mat(util.mat2euler(np.array(gt_Rs[1]))+[20,0,0]), util.euler2mat(util.mat2euler(np.array(gt_Rs[2]))+[-20,0,0])]
669
+ cat = cats[1:]
670
+ bb = gt_bb[1:]
671
+ plot_scene(image['file_path'], output_dir+'/high_blue', center, dim, R, image['K'], cat, bb)
672
+
673
+ # Make fail loss boxes for dim, pose and align
674
+ center = gt_center[1:]
675
+ dim = [[gt_dim[1][0],gt_dim[1][1],gt_dim[1][2]],[gt_dim[2][1],gt_dim[2][0],gt_dim[2][2]]]
676
+ R = [util.euler2mat(util.mat2euler(np.array(gt_Rs[1]))+[1,0,0]), util.euler2mat(util.mat2euler(np.array(gt_Rs[2]))+[1,0,0])]
677
+ cat = cats[1:]
678
+ bb = gt_bb[1:]
679
+ plot_scene(image['file_path'], output_dir+'/fail_blue', center, dim, R, image['K'], cat, bb)
680
+
681
+ return True
682
+
683
+ def gt_stats_in_terms_of_sigma(dataset):
684
+ '''
685
+ Same histograms as gt_stats, but with the dimension axes expressed in terms of sigma.
686
+ '''
687
+ # Load Image and Ground Truths
688
+ image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
689
+ image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
690
+
691
+ output_dir = 'output/figures/' + dataset
692
+ util.mkdir_if_missing(output_dir)
693
+
694
+ # histogram of centers
695
+ center_all = center_cams + center_cams_t
696
+ center_all = np.transpose(np.array(center_all))
697
+
698
+ # Filter -1 annotations
699
+ valid_columns = center_all[0] != -1
700
+ center_all = center_all[:,valid_columns]
701
+
702
+ x_label = ['x', 'y', 'z']
703
+ fig, axes = plt.subplots(1, len(center_all), figsize=(18, 5))
704
+ for i in range(len(center_all)):
705
+ axes[i].hist(center_all[i], color=color, bins=20)
706
+ axes[i].set_xlabel(x_label[i])
707
+ axes[i].set_ylabel('Count')
708
+ fig.suptitle('Center Distribution in Meters')
709
+ plt.savefig(os.path.join(output_dir, 'center.png'), dpi=300, bbox_inches='tight')
710
+ plt.close()
711
+
712
+ # histogram of dimensions
713
+ dimensions_all = dimensions_all + dimensions_all_t
714
+ dimensions_all = np.transpose(np.array(dimensions_all))
715
+
716
+ # Filter -1 annotations
717
+ valid_columns = dimensions_all[0] != -1
718
+ dimensions_all = dimensions_all[:,valid_columns]
719
+
720
+ x_label = ['w', 'h', 'l']
721
+ fig, axes = plt.subplots(1, len(dimensions_all), figsize=(18, 5))
722
+ for i in range(len(dimensions_all)):
723
+ axes[i].hist(dimensions_all[i], color=color, bins=20, density=True)
724
+
725
+ # Plot normal distribution
726
+ mu, sigma = np.mean(dimensions_all[i]), np.std(dimensions_all[i])
727
+ x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
728
+ axes[i].plot(x, stats.norm.pdf(x, mu, sigma))
729
+ y_lim = axes[i].get_ylim()[1]
730
+ axes[i].vlines(mu+sigma, 0, y_lim, linestyle='--', label='$\sigma$', color='gray')
731
+ axes[i].vlines(mu-sigma, 0, y_lim, linestyle='--', label='$\sigma$', color='gray')
732
+ axes[i].vlines(1.4, 0, y_lim, linestyle='--', color='red', label='pred')
733
+ if i != 0:
734
+ axes[i].plot((mu+sigma,1.4), (y_lim/2,y_lim/2), color='c', label='loss')
735
+ axes[i].set_xlabel(x_label[i])
736
+ axes[i].set_ylabel('density')
737
+ # Set xticks in terms of sigma
738
+ xticks = [mu - 3 * sigma, mu - 2 * sigma, mu - sigma, mu, mu + sigma, mu + 2 * sigma, mu + 3 * sigma, mu + 4 * sigma, mu + 5 * sigma, mu + 6 * sigma]
739
+ xticklabels = ['-3$\sigma$', '-2$\sigma$', '-$\sigma$', '0', '$\sigma$', '$2\sigma$', '$3\sigma$', '$4\sigma$', '$5\sigma$', '$6\sigma$']
740
+ axes[i].set_xticks(xticks)
741
+ axes[i].set_xticklabels(xticklabels)
742
+ axes[-1].legend()
743
+ fig.suptitle('Dimensions Distribution in Meters')
744
+ plt.savefig(os.path.join(output_dir, 'dimensions_sigma.png'), dpi=300, bbox_inches='tight')
745
+ plt.close()
746
+
747
+ return True
748
+
749
+ def parallel_coordinate_plot(dataset='SUNRGBD', files:list=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
750
+ '''Search the log file for the precision numbers corresponding to the last iteration
751
+ then parse it into a pd.DataFrame and draw a parallel coordinate plot of per-category AP3D for each model'''
752
+ import plotly.graph_objects as go
753
+
754
+ # df with each model as a column and performance for each class as rows
755
+ # search the file from the back until the line
756
+ # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
757
+ # is found
758
+ target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
759
+ model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
760
+ df = []
761
+ for file, model_name in zip(files, model_names):
762
+ df_i = search_file_backwards(file, target_line).drop(['AP2D'], axis=1).rename(columns={'AP3D':model_name})
763
+ assert df_i is not None, 'df not found'
764
+ df.append(df_i)
765
+ # merge df's
766
+ df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
767
+ # sort df by ap3d of model 1
768
+ df = df.sort_values(by='Base Cube R-CNN', ascending=False)
769
+ # encode each category as a number
770
+ df['category_num'] = list(reversed([i for i in range(len(df))]))
771
+
772
+ # https://plotly.com/python/parallel-coordinates-plot/
773
+ fig = go.Figure(data=
774
+ go.Parcoords(
775
+ line = dict(color = df.iloc[:, 1],
776
+ # colorscale = [[0,'purple'],[0.5,'lightseagreen'],[1,'gold']]),
777
+ colorscale = 'Viridis'),
778
+ visible = True,
779
+ dimensions = list([
780
+ dict(tickvals = df['category_num'],
781
+ ticktext = df['category'],
782
+ label = 'Categories', values = df['category_num']),
783
+ dict(range = [0,70],
784
+ constraintrange = [5,70],
785
+ label = model_names[0], values = df[model_names[0]]),
786
+ dict(range = [0,40],
787
+ label = model_names[2], values = df[model_names[2]]),
788
+ dict(range = [0,40],
789
+ label = model_names[4], values = df[model_names[4]]),
790
+ dict(range = [0,40],
791
+ label = model_names[1], values = df[model_names[1]]),
792
+ dict(range = [0,40],
793
+ label = model_names[3], values = df[model_names[3]]),
794
+ ]),
795
+ )
796
+ )
797
+
798
+ fig.update_layout(
799
+ plot_bgcolor = 'white',
800
+ paper_bgcolor = 'white',
801
+ title={
802
+ 'text': "AP3D per category for each model",
803
+ 'y':0.96,
804
+ 'x':0.5,
805
+ 'xanchor': 'center',
806
+ 'yanchor': 'top'},
807
+ margin=dict(l=65, r=25, t=80, b=5)
808
+ )
809
+ # pip install --upgrade "kaleido==0.1.*"
810
+ fig.write_image('output/figures/SUNRGBD/parallel_coordinate_plot.png', scale=3, format='png')
811
+ # fig.show()
812
+
813
+
814
+ if __name__ == '__main__':
815
+ # show_data('SUNRGBD', filter_invalid=False, output_dir='output/playground/no_filter') #{SUNRGBD,ARKitScenes,KITTI,nuScenes,Objectron,Hypersim}
816
+ # show_data('SUNRGBD', filter_invalid=True, output_dir='output/playground/with_filter') #{SUNRGBD,ARKitScenes,KITTI,nuScenes,Objectron,Hypersim}
817
+ # _ = category_distribution('SUNRGBD')
818
+ AP_vs_no_of_classes('SUNRGBD')
819
+ #spatial_statistics('SUNRGBD')
820
+ # AP3D_vs_AP2D('SUNRGBD')
821
+ # AP3D_vs_AP2D('SUNRGBD', mode='log')
822
+ # init_dataloader()
823
+ # vol_over_cat('SUNRGBD')
824
+ # gt_stats('SUNRGBD')
825
+ # gt_stats_in_terms_of_sigma('SUNRGBD')
826
+ #gt_stats('SUNRGBD')
827
+
828
+ # report_figures('SUNRGBD')
829
+
830
+ parallel_coordinate_plot()
app.py ADDED
@@ -0,0 +1,155 @@
1
+ import numpy as np
2
+ import gradio as gr
3
+ import os
4
+ import sys
5
+ import numpy as np
6
+ import torch
7
+
8
+ from detectron2.checkpoint import DetectionCheckpointer
9
+ from detectron2.config import get_cfg
10
+ from detectron2.data import transforms as T
11
+
12
+ sys.path.append(os.getcwd())
13
+ np.set_printoptions(suppress=True)
14
+
15
+ from cubercnn.config import get_cfg_defaults
16
+ from cubercnn.modeling.meta_arch import build_model
17
+ from cubercnn.modeling.backbone import build_dla_from_vision_fpn_backbone # imported for its registration side effect, even though it is not referenced directly
18
+
19
+ from cubercnn import util, vis
20
+
21
+
22
+ def do_test(im, threshold, model_str):
23
+ if im is None:
24
+ return None, None
25
+ model = load_model_config()
26
+
27
+ model.eval()
28
+
29
+ thres = threshold
30
+
31
+ min_size = 512
32
+ max_size = 4096
33
+ augmentations = T.AugmentationList([T.ResizeShortestEdge(min_size, max_size, "choice")])
34
+
35
+ category_path = 'configs/category_meta.json'
36
+
37
+ # store locally if needed
38
+ if category_path.startswith(util.CubeRCNNHandler.PREFIX):
39
+ category_path = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, category_path)
40
+
41
+ metadata = util.load_json(category_path)
42
+ cats = metadata['thing_classes']
43
+
44
+ image_shape = im.shape[:2] # h, w
45
+
46
+ h, w = image_shape
47
+
48
+ focal_length_ndc = 4.0
49
+ focal_length = focal_length_ndc * h / 2
50
+
51
+ px, py = w/2, h/2
52
+
53
+ K = np.array([
54
+ [focal_length, 0.0, px],
55
+ [0.0, focal_length, py],
56
+ [0.0, 0.0, 1.0]
57
+ ])
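# With no calibration available, the intrinsics are guessed: a virtual focal length in
# normalized device coordinates (focal_length_ndc = 4.0) is converted to pixels via
# f = f_ndc * h / 2, and the principal point is assumed to be the image centre.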
58
+
59
+ # apply the resize augmentation to the input image
60
+ aug_input = T.AugInput(im)
61
+ tfms = augmentations(aug_input)
62
+ image = aug_input.image
63
+ # model.to(device)
64
+ batched = [{
65
+ 'image': torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))),
66
+ 'height': image_shape[0], 'width': image_shape[1], 'K': K
67
+ }]
68
+ with torch.no_grad():
69
+ dets = model(batched)[0]['instances']
70
+
71
+ n_det = len(dets)
72
+
73
+ meshes = []
74
+ meshes_text = []
75
+
76
+ if n_det > 0:
77
+ for idx, (corners3D, center_cam, center_2D, dimensions, pose, score, cat_idx) in enumerate(zip(
78
+ dets.pred_bbox3D, dets.pred_center_cam, dets.pred_center_2D, dets.pred_dimensions,
79
+ dets.pred_pose, dets.scores, dets.pred_classes
80
+ )):
81
+
82
+ # skip
83
+ if score < thres:
84
+ continue
85
+
86
+ cat = cats[cat_idx]
87
+
88
+ bbox3D = center_cam.tolist() + dimensions.tolist()
89
+ meshes_text.append('{} {:.2f}'.format(cat, score))
90
+ color = [c/255.0 for c in util.get_color(idx)]
91
+ box_mesh = util.mesh_cuboid(bbox3D, pose.tolist(), color=color)
92
+ meshes.append(box_mesh)
93
+
94
+ # print('File with {} dets'.format(len(meshes)))
95
+
96
+ if len(meshes) > 0:
97
+ im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(im, K, meshes, text=meshes_text, scale=im.shape[0], blend_weight=0.5, blend_weight_overlay=0.85)
98
+ im_drawn_rgb, im_topdown = im_drawn_rgb.astype(np.uint8), im_topdown.astype(np.uint8)
99
+ else:
100
+ im_drawn_rgb, im_topdown = im.astype(np.uint8), None
101
+ return im_drawn_rgb, im_topdown
102
+
103
+ def setup(config_file):
104
+ """
105
+ Create configs and perform basic setups.
106
+ """
107
+ cfg = get_cfg()
108
+ get_cfg_defaults(cfg)
109
+
110
+ # store locally if needed
111
+ if config_file.startswith(util.CubeRCNNHandler.PREFIX):
112
+ config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file)
113
+
114
+ cfg.merge_from_file(config_file)
115
+ cfg.freeze()
116
+ return cfg
117
+
118
+ def main(config_file, weights=None):
119
+ cfg = setup(config_file)
120
+ model = build_model(cfg)
121
+
122
+ DetectionCheckpointer(model).resume_or_load(
123
+ weights, resume=True
124
+ )
125
+ return cfg, model
126
+
127
+
128
+ if __name__ == "__main__":
129
+ def load_model_config():
130
+ config_file = "configs/Omni_combined.yaml"
131
+ MODEL_WEIGHTS = "output/weak_cube_r-cnn/model_final.pth"
132
+ cfg, model = main(config_file, MODEL_WEIGHTS)
133
+ return model
134
+
135
+ title = 'Weak Cube R-CNN'
136
+ description = "This showcases the different our model [[`Weak Cube RCNN`](https://arxiv.org/abs/2504.13297). To create Weak Cube RCNN, we modify the framework by replacing its 3D loss functions with ones based solely on 2D annotations. Our methods rely heavily on external, strong generalised deep learning models to infer spatial information in scenes. Experimental results show that all models perform comparably to an annotation time-equalised Cube R-CNN, whereof the pseudo ground truth method achieves the highest accuracy. The results show the methods' ability to understand scenes in 3D, providing satisfactory visual results. Although not precise enough for centimetre accurate measurements, the method provide a solid foundation for further research. \n Check out the code on [GitHub](https://github.com/AndreasLH/Weak-Cube-R-CNN)"
137
+
138
+
139
+ demo = gr.Interface(
140
+ title=title,
141
+ fn=do_test,
142
+ inputs=[
143
+ gr.Image(label="Input Image"),
144
+ gr.Slider(0, 1, value=0.25, label="Threshold", info="Only show predictions with a confidence above this threshold"),
145
+ gr.Textbox(value="Weak Cube R-CNN", visible=False, render=False)
146
+ ],
147
+ outputs=[gr.Image(label="Predictions"), gr.Image(label="Top view")],
148
+ description=description,
149
+ allow_flagging='never',
150
+ examples=[["datasets/examples/ex2.jpg"],[],[],["datasets/examples/ex1.jpg"]],
151
+ )
152
+
153
+
154
+ # demo.launch(server_name="0.0.0.0", server_port=7860)
155
+ demo.launch()
configs/Base.yaml ADDED
@@ -0,0 +1,89 @@
1
+ SOLVER:
2
+ TYPE: "sgd"
3
+ IMS_PER_BATCH: 32
4
+ BASE_LR: 0.02
5
+ STEPS: (19200, 25600)
6
+ MAX_ITER: 32000
7
+ WEIGHT_DECAY: 0.0001
8
+ LR_SCHEDULER_NAME: "WarmupMultiStepLR"
9
+ INPUT:
10
+ MIN_SIZE_TRAIN: (256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640,)
11
+ MIN_SIZE_TEST: 512
12
+ MAX_SIZE_TRAIN: 4096
13
+ MAX_SIZE_TEST: 4096
14
+ TEST:
15
+ VISIBILITY_THRES: 0.33333333
16
+ TRUNCATION_THRES: 0.33333333
17
+ EVAL_PERIOD: 16000
18
+ DATASETS:
19
+ TRAIN: ('KITTI_train', 'KITTI_val')
20
+ TEST: ('KITTI_test',)
21
+ CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck', 'tram', 'person')
22
+ IGNORE_NAMES: "['dontcare', 'ignore', 'void']"
23
+ MIN_HEIGHT_THRES: 0.05
24
+ TRUNCATION_THRES: 0.75
25
+ VISIBILITY_THRES: 0.25
26
+ TRUNC_2D_BOXES: True
27
+ VIS_PERIOD: 640
28
+ DATALOADER:
29
+ SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
30
+ REPEAT_THRESHOLD: 0.1
31
+ MODEL:
32
+ PIXEL_MEAN: [103.530, 116.280, 123.675]
33
+ PIXEL_STD: [57.375, 57.120, 58.395]
34
+ META_ARCHITECTURE: "RCNN3D"
35
+ MASK_ON: False
36
+ STABILIZE: 0.02
37
+ USE_BN: True
38
+ BACKBONE:
39
+ FREEZE_AT: 0
40
+ NAME: 'build_dla_from_vision_fpn_backbone'
41
+ DLA:
42
+ TYPE: 'dla34'
43
+ FPN:
44
+ IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6']
45
+ ANCHOR_GENERATOR:
46
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
47
+ ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
48
+ RPN:
49
+ HEAD_NAME: "StandardRPNHead"
50
+ IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6']
51
+ PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
52
+ PRE_NMS_TOPK_TEST: 1000 # Per FPN level
53
+ POST_NMS_TOPK_TRAIN: 1000
54
+ POST_NMS_TOPK_TEST: 1000
55
+ BOUNDARY_THRESH: -1
56
+ OBJECTNESS_UNCERTAINTY: "IoUness"
57
+ IOU_THRESHOLDS: [0.05, 0.05]
58
+ POSITIVE_FRACTION: 1.0
59
+ PROPOSAL_GENERATOR:
60
+ NAME: "RPNWithIgnore"
61
+ ROI_HEADS:
62
+ NAME: "ROIHeads3D"
63
+ IN_FEATURES: ["p2", "p3", "p4", "p5", 'p6']
64
+ BATCH_SIZE_PER_IMAGE: 512
65
+ SCORE_THRESH_TEST: 0.01
66
+ NUM_CLASSES: 43
67
+ ROI_BOX_HEAD:
68
+ NAME: "FastRCNNConvFCHead"
69
+ NUM_FC: 2
70
+ POOLER_RESOLUTION: 7
71
+ ROI_CUBE_HEAD:
72
+ NAME: 'CubeHead'
73
+ Z_TYPE: 'direct'
74
+ POSE_TYPE: '6d'
75
+ NUM_FC: 2
76
+ SHARED_FC: True
77
+ USE_CONFIDENCE: 1.0
78
+ LOSS_W_3D: 1.0
79
+ POOLER_TYPE: 'ROIAlignV2'
80
+ POOLER_RESOLUTION: 7
81
+ DISENTANGLED_LOSS: True
82
+ ALLOCENTRIC_POSE: True
83
+ VIRTUAL_FOCAL: 512.0
84
+ VIRTUAL_DEPTH: True
85
+ CHAMFER_POSE: True
86
+ TEST: 'blasss'
87
+ DIMS_PRIORS_ENABLED: True
88
+ DIMS_PRIORS_PRECOMPUTED: False
89
+ VERSION: 2
configs/Base_Omni3D.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 2 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248
5
+ BASE_LR: 0.0214 #0.12
6
+ STEPS: (17280, 23040)
7
+ MAX_ITER: 100000 #116000
8
+ WARMUP_ITERS: 0 #3625
9
+ TEST:
10
+ EVAL_PERIOD: 7200 #29000
11
+ VIS_PERIOD: 1 #2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train_mini', 'SUNRGBD_val_mini')
14
+ TEST: ('SUNRGBD_test_mini',)
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
configs/Base_Omni3D_2D_only.yaml ADDED
@@ -0,0 +1,20 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 6 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248
5
+ BASE_LR: 0.0214 #0.12
6
+ STEPS: (30000, 40000)
7
+ MAX_ITER: 50000 #116000
8
+ WARMUP_ITERS: 0 #3625
9
+ TEST:
10
+ EVAL_PERIOD: 25000 #29000
11
+ VIS_PERIOD: 50000 #2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val')
14
+ TEST: ('SUNRGBD_test',)
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
19
+ ROI_CUBE_HEAD:
20
+ LOSS_W_3D: 0.0
configs/Base_Omni3D_in.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 128
5
+ BASE_LR: 0.08
6
+ STEPS: (69600, 92800)
7
+ MAX_ITER: 116000
8
+ WARMUP_ITERS: 3625
9
+ TEST:
10
+ EVAL_PERIOD: 29000
11
+ VIS_PERIOD: 2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val')
14
+ TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test')
15
+ CATEGORY_NAMES: ('stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 38
configs/Base_Omni3D_og.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 192
5
+ BASE_LR: 0.12
6
+ STEPS: (69600, 92800)
7
+ MAX_ITER: 116000
8
+ WARMUP_ITERS: 3625
9
+ TEST:
10
+ EVAL_PERIOD: 29000
11
+ VIS_PERIOD: 2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val')
14
+ TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test')
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
configs/Base_Omni3D_out.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 32
5
+ BASE_LR: 0.02
6
+ STEPS: (69600, 92800)
7
+ MAX_ITER: 116000
8
+ WARMUP_ITERS: 3625
9
+ TEST:
10
+ EVAL_PERIOD: 29000
11
+ VIS_PERIOD: 2320
12
+ DATASETS:
13
+ TRAIN: ('nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val')
14
+ TEST: ('nuScenes_test', 'KITTI_test')
15
+ CATEGORY_NAMES: ('cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 11
configs/Base_Omni3D_prof.yaml ADDED
@@ -0,0 +1,18 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 2 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248
5
+ BASE_LR: 0.001224489796 #0.12
6
+ STEPS: (172, 230)
7
+ MAX_ITER: 288 #116000
8
+ WARMUP_ITERS: 9 #3625
9
+ TEST:
10
+ EVAL_PERIOD: 72 #29000
11
+ VIS_PERIOD: 6 #2320
12
+ DATASETS:
13
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val')
14
+ TEST: ('SUNRGBD_test',)
15
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
16
+ MODEL:
17
+ ROI_HEADS:
18
+ NUM_CLASSES: 50
configs/Omni_combined.yaml ADDED
@@ -0,0 +1,37 @@
1
+ _BASE_: "Base.yaml"
2
+ SOLVER:
3
+ TYPE: "sgd"
4
+ IMS_PER_BATCH: 25
5
+ BASE_LR: 0.015
6
+ STEPS: (35000, 40000)
7
+ MAX_ITER: 42001
8
+ WARMUP_ITERS: 0
9
+ CHECKPOINT_PERIOD: 1000
10
+ TEST:
11
+ EVAL_PERIOD: 100000
12
+ VIS_PERIOD: 1000
13
+ DATASETS:
14
+ TRAIN: ('SUNRGBD_train', 'SUNRGBD_val') #, 'KITTI_train_mini', 'KITTI_val_mini')
15
+ TEST: ('SUNRGBD_test',) # 'KITTI_test_mini')
16
+ CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin')
17
+ MODEL:
18
+ DEVICE: 'cpu'
19
+ DEPTH_ON: False # whether to use the Depth Anything concatenated features; if disabled, we can use ["p2", "p3", "p4", "p5", "p6"] with sizes [[32], [64], [128], [256], [512]], otherwise only ["p2", "p3", "p4", "p5"] with [[32], [64], [128], [256]]
20
+ FPN:
21
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
22
+ RPN:
23
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
24
+ ANCHOR_GENERATOR:
25
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
26
+ ROI_HEADS:
27
+ NAME: 'ROIHeads3DScore' # name of the class that is the 3d predictor
28
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
29
+ NUM_CLASSES: 50
30
+ POSITIVE_FRACTION: 0.25 # controls the ratio of positive to negative cubes sampled during training
31
+ ROI_CUBE_HEAD:
32
+ NAME: 'CubeHead' # name of the 3d head
33
+ DIMS_PRIORS_ENABLED: True
34
+ POOLER_TYPE: 'ROIAlignV2'
35
+ POOLER_RESOLUTION: 7
36
+ LOSS_W_3D: 1.0
37
+ META_ARCHITECTURE: 'RCNN3D_combined_features' # name of the overall arch that calls the ROI_HEADS.NAME and ROI_CUBE_HEAD.NAME
configs/category_meta.json ADDED
@@ -0,0 +1 @@
1
+ {"thing_classes": ["pedestrian", "car", "cyclist", "van", "truck", "traffic cone", "barrier", "motorcycle", "bicycle", "bus", "trailer", "books", "bottle", "camera", "cereal box", "chair", "cup", "laptop", "shoes", "towel", "blinds", "window", "lamp", "shelves", "mirror", "sink", "cabinet", "bathtub", "door", "toilet", "desk", "box", "bookcase", "picture", "table", "counter", "bed", "night stand", "pillow", "sofa", "television", "floor mat", "curtain", "clothes", "stationery", "refrigerator", "bin", "stove", "oven", "machine"], "thing_dataset_id_to_contiguous_id": {"0": 0, "1": 1, "3": 2, "4": 3, "5": 4, "8": 5, "9": 6, "10": 7, "11": 8, "12": 9, "13": 10, "14": 11, "15": 12, "16": 13, "17": 14, "18": 15, "19": 16, "20": 17, "21": 18, "22": 19, "23": 20, "24": 21, "25": 22, "26": 23, "27": 24, "28": 25, "29": 26, "30": 27, "31": 28, "32": 29, "33": 30, "34": 31, "35": 32, "36": 33, "37": 34, "38": 35, "39": 36, "40": 37, "42": 38, "43": 39, "44": 40, "45": 41, "46": 42, "47": 43, "48": 44, "49": 45, "52": 46, "53": 47, "57": 48, "61": 49}}
configs/cubercnn_DLA34_FPN.yaml ADDED
@@ -0,0 +1,6 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_dla_from_vision_fpn_backbone'
5
+ DLA:
6
+ TYPE: 'dla34'
configs/cubercnn_ResNet34_FPN.yaml ADDED
@@ -0,0 +1,7 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_resnet_from_vision_fpn_backbone'
5
+ RESNETS:
6
+ DEPTH: 34
7
+ TORCHVISION: True
configs/cubercnn_densenet_FPN.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_densenet_fpn_backbone'
configs/cubercnn_mnasnet_FPN.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_mnasnet_fpn_backbone'
configs/cubercnn_shufflenet_FPN.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _BASE_: "Base_Omni3D.yaml"
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: 'build_shufflenet_fpn_backbone'
cubercnn/config/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .config import *
cubercnn/config/config.py ADDED
@@ -0,0 +1,187 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from detectron2.config import CfgNode as CN
3
+
4
+ def get_cfg_defaults(cfg):
5
+
6
+ # A list of category names which will be used
7
+ cfg.DATASETS.CATEGORY_NAMES = []
8
+
9
+ # The category names which will be treated as ignore
10
+ # e.g., not counting as background during training
11
+ # or as false positives during evaluation.
12
+ cfg.DATASETS.IGNORE_NAMES = []
13
+
14
+ # Should the datasets appear with the same probability
15
+ # in batches (e.g., the imbalance from small and large
16
+ # datasets will be accounted for during sampling)
17
+ cfg.DATALOADER.BALANCE_DATASETS = False
18
+
19
+ # The thresholds for when to treat a known box
20
+ # as ignore based on too heavy of truncation or
21
+ # too low of visibility in the image. This affects
22
+ # both training and evaluation ignores.
23
+ cfg.DATASETS.TRUNCATION_THRES = 0.99
24
+ cfg.DATASETS.VISIBILITY_THRES = 0.01
25
+ cfg.DATASETS.MIN_HEIGHT_THRES = 0.00
26
+ cfg.DATASETS.MAX_DEPTH = 1e8
27
+
28
+ # Whether modal 2D boxes should be loaded,
29
+ # or if the full 3D projected boxes should be used.
30
+ cfg.DATASETS.MODAL_2D_BOXES = False
31
+
32
+ # Whether truncated 2D boxes should be loaded,
33
+ # or if the 3D full projected boxes should be used.
34
+ cfg.DATASETS.TRUNC_2D_BOXES = True
35
+
36
+ # Threshold used for matching and filtering boxes
37
+ # inside of ignore regions, within the RPN and ROIHeads
38
+ cfg.MODEL.RPN.IGNORE_THRESHOLD = 0.5
39
+
40
+ # Configuration for cube head
41
+ cfg.MODEL.ROI_CUBE_HEAD = CN()
42
+ cfg.MODEL.ROI_CUBE_HEAD.NAME = "CubeHead"
43
+ cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION = 7
44
+ cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO = 0
45
+ cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE = "ROIAlignV2"
46
+
47
+ # Settings for the cube head features
48
+ cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV = 0
49
+ cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM = 256
50
+ cfg.MODEL.ROI_CUBE_HEAD.NUM_FC = 2
51
+ cfg.MODEL.ROI_CUBE_HEAD.FC_DIM = 1024
52
+ # proposal method
53
+ cfg.MODEL.ROI_CUBE_HEAD.NUMBER_OF_PROPOSALS = 1000
54
+
55
+ # the style to predict Z with currently supported
56
+ # options --> ['direct', 'sigmoid', 'log', 'clusters']
57
+ cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE = "direct"
58
+
59
+ # the style to predict pose with currently supported
60
+ # options --> ['6d', 'euler', 'quaternion']
61
+ cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE = "6d"
62
+
63
+ # Whether to scale all 3D losses by inverse depth
64
+ cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT = False
65
+
66
+ # Virtual depth puts all predictions of depth into
67
+ # a shared virtual space with a shared focal length.
68
+ cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH = True
69
+ cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL = 512.0
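# Rough sketch of the virtual-depth idea described above (illustrative, not the exact
# Cube R-CNN implementation): depth is rescaled into a shared camera space so that
# supervision is comparable across datasets with different intrinsics, e.g.
#   z_virtual = z * VIRTUAL_FOCAL / effective_focal
# where effective_focal is the image's focal length after accounting for resizing.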
70
+
71
+ # If true, then all losses are computed using the 8 corners
72
+ # such that they are all in a shared scale space.
73
+ # E.g., their scale correlates with their impact on 3D IoU.
74
+ # This way no manual weights need to be set.
75
+ cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS = True
76
+
77
+ # When > 1, the outputs of the 3D head will be based on
78
+ # a 2D scale clustering, based on 2D proposal height/width.
79
+ # This parameter describes the number of bins to cluster.
80
+ cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS = 1
81
+
82
+ # Whether batch norm is enabled during training.
83
+ # If false, all BN weights will be frozen.
84
+ cfg.MODEL.USE_BN = True
85
+
86
+ # Whether to predict the pose in allocentric space.
87
+ # The allocentric space may correlate better with 2D
88
+ # images compared to egocentric poses.
89
+ cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE = True
90
+
91
+ # Whether to use chamfer distance for disentangled losses
92
+ # of pose. This avoids periodic issues of rotation but
93
+ # may prevent the pose "direction" from being interpretable.
94
+ cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE = True
95
+
96
+ # Should the prediction heads share FC features or not.
97
+ # These include groups of uv, z, whl, pose.
98
+ cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC = True
99
+
100
+ # Check for stable gradients. When inf is detected, skip the update.
101
+ # This prevents an occasional bad sample from exploding the model.
102
+ # The threshold below is the allowed percentage of bad samples.
103
+ # 0.0 is off, and 0.01 is recommended for minor robustness to exploding.
104
+ cfg.MODEL.STABILIZE = 0.01
105
+
106
+ # Whether or not to use the dimension priors
107
+ cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED = True
108
+
109
+ # How should prior dimensions be computed?
110
+ # The supported modes are ["exp", "sigmoid"]
111
+ # where exp is unbounded and sigmoid is bounded
112
+ # between +- 3 standard deviations from the mean.
113
+ cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC = 'exp'
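# Illustrative reading of the two modes above (formulas are a sketch, not the exact
# implementation): with a per-category prior (mu, sigma) and a network output d,
#   'exp'     -> dims = mu * exp(d)                        # unbounded
#   'sigmoid' -> dims = mu + sigma * (6 * sigmoid(d) - 3)  # bounded to mu +/- 3*sigma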
114
+
115
+ # weight for confidence loss. 0 is off.
116
+ cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE = 1.0
117
+
118
+ # Loss weights for XY, Z, Dims, Pose
119
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D = 1.0
120
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY = 1.0
121
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE = 7.0
122
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_NORMAL_VEC = 20.0
123
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_IOU = 1.0
124
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_SEG = 2.5
125
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z = 1.0
126
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS = 20.0
127
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DEPTH = 1.0
128
+
129
+ cfg.MODEL.DLA = CN()
130
+
131
+ # Supported types for DLA backbones are...
132
+ # dla34, dla46_c, dla46x_c, dla60x_c, dla60, dla60x, dla102x, dla102x2, dla169
133
+ cfg.MODEL.DLA.TYPE = 'dla34'
134
+
135
+ # Only available for dla34, dla60, dla102
136
+ cfg.MODEL.DLA.TRICKS = False
137
+
138
+ # A joint loss for the disentangled loss.
139
+ # All predictions are computed using a corner
140
+ # or chamfer loss depending on chamfer_pose!
141
+ # Recommended to keep this weight small: [0.05, 0.5]
142
+ cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT = 1.0
143
+
144
+ # sgd, adam, adam+amsgrad, adamw, adamw+amsgrad
145
+ cfg.SOLVER.TYPE = 'sgd'
146
+
147
+ cfg.MODEL.RESNETS.TORCHVISION = True
148
+ cfg.TEST.DETECTIONS_PER_IMAGE = 100
149
+
150
+ cfg.TEST.VISIBILITY_THRES = 1/2.0
151
+ cfg.TEST.TRUNCATION_THRES = 1/2.0
152
+
153
+ cfg.INPUT.RANDOM_FLIP = "horizontal"
154
+
155
+ # When True, we will use localization uncertainty
156
+ # as the new IoUness score in the RPN.
157
+ cfg.MODEL.RPN.OBJECTNESS_UNCERTAINTY = 'IoUness'
158
+
159
+ # If > 0.0 this is the scaling factor that will be applied to
160
+ # an RoI 2D box before doing any pooling to give more context.
161
+ # Ex. 1.5 makes width and height 50% larger.
162
+ cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES = 0.0
163
+
164
+ # weight path specifically for pretraining (no checkpointables will be loaded)
165
+ cfg.MODEL.WEIGHTS_PRETRAIN = ''
166
+
167
+ # ## start of our additions
168
+ cfg.MODEL.ROI_CUBE_HEAD.TEST = 'bas'
169
+ cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_PRECOMPUTED = False
170
+
171
+ cfg.PLOT = CN(new_allowed=True)
172
+ cfg.PLOT.OUTPUT_DIR = ''
173
+ cfg.PLOT.EVAL = ''
174
+ cfg.PLOT.MODE2D = '' # either 'GT' or 'PRED'
175
+
176
+ cfg.PLOT.SCORING_FUNC = None
177
+ cfg.PLOT.PROPOSAL_FUNC = None
178
+ cfg.PLOT.number_of_proposals = 1000
179
+
180
+ cfg.TRAIN = CN(new_allowed=True)
181
+ cfg.TRAIN.pseudo_gt = 'learn'
182
+
183
+ # these are meant to be overridden via command-line arguments
184
+ cfg.log = True
185
+ # (these 2 are mutually exclusive) z_pseudo_gt_patch or z_pseudo_gt_center
186
+ cfg.loss_functions = ['iou']
187
+ cfg.MODEL.DEPTH_ON = False # whether to use the concatenated Depth Anything features
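For context (not part of the diff), a minimal sketch of how a few of these new options could be attached to a fresh detectron2 config and overridden from the command line; the helper name add_cubercnn_defaults is hypothetical and only mirrors a small subset of the defaults registered above.

from detectron2.config import CfgNode as CN, get_cfg

def add_cubercnn_defaults(cfg):
    # hypothetical helper: registers a subset of the options shown above
    cfg.MODEL.ROI_CUBE_HEAD = CN()
    cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE = "direct"      # one of ['direct', 'sigmoid', 'log', 'clusters']
    cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE = "6d"       # one of ['6d', 'euler', 'quaternion']
    cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH = True
    cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL = 512.0
    return cfg

cfg = add_cubercnn_defaults(get_cfg())
cfg.merge_from_list(["MODEL.ROI_CUBE_HEAD.Z_TYPE", "log"])  # CLI-style override
print(cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE)  # -> log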
cubercnn/data/Omni_to_kitti.py ADDED
@@ -0,0 +1,197 @@
1
+ import torch
2
+ from detectron2.data.catalog import MetadataCatalog
3
+ from cubercnn import data
4
+ from detectron2.structures import Boxes, BoxMode
5
+ from cubercnn.util.math_util import estimate_truncation, mat2euler, R_to_allocentric
6
+ import os
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
+ def perp_vector(a, b):
11
+ return np.array([b, -a])
12
+
13
+ def rotate_vector(x, y, theta):
14
+ # Calculate the rotated coordinates
15
+ x_rotated = x * np.cos(theta) - y * np.sin(theta)
16
+ y_rotated = x * np.sin(theta) + y * np.cos(theta)
17
+
18
+ return np.array([x_rotated, y_rotated])
19
+
20
+ def calculate_alpha(location, ry):
21
+ '''
22
+ location: x, y, z coordinates
23
+ ry: rotation around the y-axis; negative is counter-clockwise.
24
+
25
+ The positive x-axis points to the right.
26
+ Calculate the angle between a line perpendicular to the camera-to-object ray and the object's heading (ry).'''
27
+
28
+ # get vector from camera to object
29
+ ry = -ry
30
+ x, y, z = location
31
+ # vector from [0,0,0] to the center of the bounding box
32
+ # we can do the whole thing in 2D, top down view
33
+ # vector perpendicular to center
34
+ perpendicular = perp_vector(x,z)
35
+ # vector corresponding to ry
36
+ ry_vector = np.array([np.cos(ry), np.sin(ry)])
37
+ # angle between perpendicular and ry_vector
38
+ dot = perpendicular[0]*ry_vector[0] + perpendicular[1]*ry_vector[1] # Dot product between [x1, y1] and [x2, y2]
39
+ det = perpendicular[0]*ry_vector[1] - perpendicular[1]*ry_vector[0] # Determinant
40
+ alpha = -np.arctan2(det, dot)
41
+
42
+ # wrap to -pi to pi
43
+ if alpha > np.pi:
44
+ alpha -= 2*np.pi
45
+ if alpha < -np.pi:
46
+ alpha += 2*np.pi
47
+ return alpha
48
+
49
+ def test_calculate_alpha():
50
+ location = [-3.67, 1.67, 6.05]
51
+ ry = -1.24
52
+ expected = -0.72
53
+ result1 = calculate_alpha(location, ry)
54
+
55
+ location = [-9.48, 2.08, 26.41]
56
+ ry = 1.77
57
+ expected = 2.11
58
+ result2 = calculate_alpha(location, ry)
59
+
60
+ location = [4.19, 1.46, 44.41]
61
+ ry = -1.35
62
+ expected = -1.45
63
+ result3 = calculate_alpha(location, ry)
64
+
65
+ location = [-6.41, 2.04, 46.74]
66
+ ry = 1.68
67
+ expected = 1.82
68
+ result4 = calculate_alpha(location, ry)
69
+
70
+ location = [0.28, 2.08, 17.74]
71
+ ry = -1.58
72
+ expected = -1.59
73
+ result5 = calculate_alpha(location, ry)
74
+
75
+ location = [-3.21, 1.97, 11.22]
76
+ ry = -0.13
77
+ expected = 0.15
78
+ result6 = calculate_alpha(location, ry)
79
+
80
+ # assert np.isclose(result, expected, atol=0.01)
81
+ return result1
82
+
83
+
84
+ def main():
85
+ alpha = test_calculate_alpha()
86
+
87
+
88
+ name = 'KITTI'
89
+ split = 'test'
90
+ dataset_paths_to_json = [f'datasets/Omni3D/{name}_{split}.json',]
91
+ os.makedirs('output/KITTI_formatted_predictions', exist_ok=True)
92
+
93
+ # Example 1. load all images
94
+ dataset = data.Omni3D(dataset_paths_to_json)
95
+ imgIds = dataset.getImgIds()
96
+ imgs = dataset.loadImgs(imgIds)
97
+
98
+ # Example 2. load annotations for image index 0
99
+ annIds = dataset.getAnnIds(imgIds=imgs[0]['id'])
100
+ anns = dataset.loadAnns(annIds)
101
+
102
+ data.register_and_store_model_metadata(dataset, 'output')
103
+
104
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
105
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
106
+ cats = {'pedestrian', 'car', 'cyclist', 'van', 'truck'}
107
+
108
+ input_folder = 'kitti_omni_eq'
109
+
110
+ out_path = 'output/'+input_folder+'/KITTI_formatted_predictions/'
111
+ in_path = 'output/'+input_folder+'/KITTI_pred/instances_predictions.pth'
112
+ print('saving to', out_path)
113
+ data_json = torch.load(in_path)
114
+ #
115
+ # reference
116
+ # https://github.com/ZrrSkywalker/MonoDETR/blob/c724572bddbc067832a0e0d860a411003f36c2fa/lib/helpers/tester_helper.py#L114
117
+ files = {}
118
+ for image in tqdm(data_json):
119
+ K = image['K']
120
+ K_inv = np.linalg.inv(K)
121
+ width, height = image['width'], image['height']
122
+ image_id = image['image_id']
123
+ l = []
124
+ for pred in image['instances']:
125
+
126
+ category = thing_classes[pred['category_id']]
127
+ if category not in cats:
128
+ continue
129
+ occluded = 0
130
+ # truncation = estimate_truncation(K, torch.tensor([x3d, y3d, z3d, w3d, h3d, l3d]), pred['pose'], width, height)
131
+ truncation = 0.0 # it does not matter
132
+ rotation_y = mat2euler(np.array(pred['pose']))[1]
133
+ bbox = BoxMode.convert(pred['bbox'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) # convert XYWH to XYXY (left, top, right, bottom)
134
+ h3d, w3d, l3d = pred['dimensions']
135
+ # unproject, this should yield the same
136
+ # cen_2d = np.array(pred['center_2D'] + [1])
137
+ # z3d = pred['center_cam'][2]
138
+ # x3d, y3d, z3d = (K_inv @ (z3d*cen_2d))
139
+
140
+ x3d, y3d, z3d = pred['center_cam']
141
+
142
+ location = pred['center_cam']
143
+ score = pred['score']
144
+ alpha = calculate_alpha(location, rotation_y)
145
+
146
+ # convert to KITTI format
147
+ li = [category, truncation, occluded, alpha, bbox[0], bbox[1], bbox[2], bbox[3], h3d, w3d, l3d, x3d, y3d, z3d, rotation_y, score]
148
+ l.append(li)
149
+ # sort l by z3d
150
+ l = sorted(l, key=lambda x: x[13])
151
+ files[image_id] = l
152
+
153
+ # 7518 test images
154
+ os.makedirs(out_path, exist_ok=True)
155
+ for img_id, content in files.items():
156
+
157
+ img_id_str = str(img_id).zfill(6)
158
+ with open(out_path+f'{img_id_str}.txt', 'w') as f:
159
+ str_i = ''
160
+ for i in content:
161
+ # t = f'{category} {truncation:.2f} {occluded} {alpha:.2f} {bbox[0]:.2f} {bbox[1]:.2f} {bbox[2]:.2f} {bbox[3]:.2f} {w3d:.2f} {h3d:.2f} {l3d:.2f} {x3d:.2f} {y3d:.2f} {z3d:.2f} {rotation_y:.2f} {score:.2f}\n'
162
+ t = f'{i[0][0].upper() + i[0][1:]} {i[1]:.2f} {i[2]} {i[3]:.2f} {i[4]:.2f} {i[5]:.2f} {i[6]:.2f} {i[7]:.2f} {i[8]:.2f} {i[9]:.2f} {i[10]:.2f} {i[11]:.2f} {i[12]:.2f} {i[13]:.2f} {i[14]:.2f} {i[15]:.2f}\n'
163
+ str_i += t
164
+ f.write(str_i)
165
+
166
+ if __name__ == '__main__':
167
+ main()
168
+
169
+ # write to file
170
+ # #Values Name Description
171
+ # ----------------------------------------------------------------------------
172
+ # 1 type Describes the type of object: 'Car', 'Van', 'Truck',
173
+ # 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
174
+ # 'Misc' or 'DontCare'
175
+ # 1 truncated Float from 0 (non-truncated) to 1 (truncated), where
176
+ # truncated refers to the object leaving image boundaries
177
+ # 1 occluded Integer (0,1,2,3) indicating occlusion state:
178
+ # 0 = fully visible, 1 = partly occluded
179
+ # 2 = largely occluded, 3 = unknown
180
+ # 1 alpha Observation angle of object, ranging [-pi..pi]
181
+ # 4 bbox 2D bounding box of object in the image (0-based index):
182
+ # contains left, top, right, bottom pixel coordinates
183
+ # 3 dimensions 3D object dimensions: height, width, length (in meters)
184
+ # 3 location 3D object location x,y,z in camera coordinates (in meters)
185
+ # 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi]
186
+ # 1 score Only for results: Float, indicating confidence in
187
+ # detection, needuhued for p/r curves, higher is better.
188
+
189
+ # output to files 000000.txt 000001.txt ...
190
+
191
+ # example file
192
+ # Car 0.00 0 -1.56 564.62 174.59 616.43 224.74 1.61 1.66 3.20 -0.69 1.69 25.01 -1.59
193
+ # Car 0.00 0 1.71 481.59 180.09 512.55 202.42 1.40 1.51 3.70 -7.43 1.88 47.55 1.55
194
+ # Car 0.00 0 1.64 542.05 175.55 565.27 193.79 1.46 1.66 4.05 -4.71 1.71 60.52 1.56
195
+ # Cyclist 0.00 0 1.89 330.60 176.09 355.61 213.60 1.72 0.50 1.95 -12.63 1.88 34.09 1.54
196
+ # DontCare -1 -1 -10 753.33 164.32 798.00 186.74 -1 -1 -1 -1000 -1000 -1000 -10
197
+ # DontCare -1 -1 -10 738.50 171.32 753.27 184.42 -1 -1 -1 -1000 -1000 -1000 -10
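As a side note (not part of the commit), the vector construction in calculate_alpha should agree, up to angle wrapping, with the more common closed form for the KITTI observation angle, alpha = rotation_y - arctan2(x, z). A small sanity check under that assumption:

import numpy as np

def alpha_reference(location, rotation_y):
    # closed-form KITTI observation angle, wrapped to [-pi, pi]
    x, _, z = location
    a = rotation_y - np.arctan2(x, z)
    return (a + np.pi) % (2 * np.pi) - np.pi

# the first test case above expects roughly -0.72; both formulations give about -0.70
print(alpha_reference([-3.67, 1.67, 6.05], -1.24))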
cubercnn/data/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .datasets import *
2
+ from .dataset_mapper import *
3
+ from .build import *
4
+ from .builtin import *
5
+ from .Omni_to_kitti import *
cubercnn/data/build.py ADDED
@@ -0,0 +1,260 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import itertools
3
+ import logging
4
+ import numpy as np
5
+ import math
6
+ from collections import defaultdict
7
+ import torch.utils.data
8
+
9
+ from detectron2.config import configurable
10
+ from detectron2.utils.logger import _log_api_usage
11
+
12
+ from detectron2.data.catalog import DatasetCatalog
13
+ from detectron2.data.common import DatasetFromList, MapDataset
14
+ from detectron2.data.dataset_mapper import DatasetMapper
15
+ from detectron2.data.samplers import (
16
+ InferenceSampler,
17
+ RepeatFactorTrainingSampler,
18
+ TrainingSampler
19
+ )
20
+ from detectron2.data.build import (
21
+ build_batch_data_loader,
22
+ trivial_batch_collator
23
+ )
24
+
25
+ def filter_images_with_only_crowd_annotations(dataset_dicts):
26
+ """
27
+ Filter out images with none annotations or only crowd annotations
28
+ (i.e., images without non-crowd annotations).
29
+ A common training-time preprocessing on COCO dataset.
30
+
31
+ Args:
32
+ dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
33
+
34
+ Returns:
35
+ list[dict]: the same format, but filtered.
36
+ """
37
+ num_before = len(dataset_dicts)
38
+
39
+ def valid(anns):
40
+ for ann in anns:
41
+ if ann.get("iscrowd", 0) == 0:
42
+ return True
43
+ return False
44
+
45
+ dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
46
+ num_after = len(dataset_dicts)
47
+ logger = logging.getLogger(__name__)
48
+ logger.info(
49
+ "Removed {} images marked with crowd. {} images left.".format(
50
+ num_before - num_after, num_after
51
+ )
52
+ )
53
+ return dataset_dicts
54
+
55
+ def get_detection_dataset_dicts(names, filter_empty=True, **kwargs):
56
+
57
+ if isinstance(names, str):
58
+ names = [names]
59
+
60
+ assert len(names), names
61
+ dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names]
62
+ for dataset_name, dicts in zip(names, dataset_dicts):
63
+ assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
64
+
65
+ dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
66
+
67
+ has_instances = "annotations" in dataset_dicts[0]
68
+
69
+ if filter_empty and has_instances:
70
+ dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
71
+
72
+ assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
73
+ return dataset_dicts
74
+
75
+
76
+ def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None, dataset_id_to_src=None):
77
+ if dataset is None:
78
+ dataset = get_detection_dataset_dicts(
79
+ cfg.DATASETS.TRAIN,
80
+ filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
81
+ min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
82
+ if cfg.MODEL.KEYPOINT_ON
83
+ else 0,
84
+ proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
85
+ )
86
+ _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
87
+
88
+ if mapper is None:
89
+ mapper = DatasetMapper(cfg, True)
90
+
91
+ if sampler is None:
92
+ sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
93
+ balance_datasets = cfg.DATALOADER.BALANCE_DATASETS
94
+ logger = logging.getLogger(__name__)
95
+ logger.info("Using training sampler {}".format(sampler_name))
96
+
97
+ if balance_datasets:
98
+ assert dataset_id_to_src is not None, 'Need dataset sources.'
99
+
100
+ dataset_source_to_int = {val:i for i, val in enumerate(set(dataset_id_to_src.values()))}
101
+ dataset_ids_per_img = [dataset_source_to_int[dataset_id_to_src[img['dataset_id']]] for img in dataset]
102
+ dataset_ids = np.unique(dataset_ids_per_img)
103
+
104
+ # only one source? don't re-weight then.
105
+ if len(dataset_ids) == 1:
106
+ weights_per_img = torch.ones(len(dataset_ids_per_img)).float()
107
+
108
+ # compute per-dataset weights.
109
+ else:
110
+ counts = np.bincount(dataset_ids_per_img)
111
+ counts = [counts[id] for id in dataset_ids]
112
+ weights = [1 - count/np.sum(counts) for count in counts]
113
+ weights = [weight/np.min(weights) for weight in weights]
114
+
115
+ weights_per_img = torch.zeros(len(dataset_ids_per_img)).float()
116
+ dataset_ids_per_img = torch.FloatTensor(dataset_ids_per_img).long()
117
+
118
+ # copy weights
119
+ for dataset_id, weight in zip(dataset_ids, weights):
120
+ weights_per_img[dataset_ids_per_img == dataset_id] = weight
121
+
122
+ # no special sampling whatsoever
123
+ if sampler_name == "TrainingSampler" and not balance_datasets:
124
+ sampler = TrainingSampler(len(dataset))
125
+
126
+ # balance the weight sampling by datasets
127
+ elif sampler_name == "TrainingSampler" and balance_datasets:
128
+ sampler = RepeatFactorTrainingSampler(weights_per_img)
129
+
130
+ # balance the weight sampling by categories
131
+ elif sampler_name == "RepeatFactorTrainingSampler" and not balance_datasets:
132
+ repeat_factors = repeat_factors_from_category_frequency(
133
+ dataset, cfg.DATALOADER.REPEAT_THRESHOLD
134
+ )
135
+ sampler = RepeatFactorTrainingSampler(repeat_factors)
136
+
137
+ # balance the weight sampling by categories AND by dataset frequency
138
+ elif sampler_name == "RepeatFactorTrainingSampler" and balance_datasets:
139
+ repeat_factors = repeat_factors_from_category_frequency(
140
+ dataset, cfg.DATALOADER.REPEAT_THRESHOLD
141
+ )
142
+ repeat_factors *= weights_per_img
143
+ repeat_factors /= repeat_factors.min().item()
144
+ sampler = RepeatFactorTrainingSampler(repeat_factors)
145
+ else:
146
+ raise ValueError("Unknown training sampler: {}".format(sampler_name))
147
+
148
+ return {
149
+ "dataset": dataset,
150
+ "sampler": sampler,
151
+ "mapper": mapper,
152
+ "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
153
+ "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
154
+ "num_workers": cfg.DATALOADER.NUM_WORKERS,
155
+ }
156
+
157
+
158
+ def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh):
159
+ """
160
+ Compute (fractional) per-image repeat factors based on category frequency.
161
+ The repeat factor for an image is a function of the frequency of the rarest
162
+ category labeled in that image. The "frequency of category c" in [0, 1] is defined
163
+ as the fraction of images in the training set (without repeats) in which category c
164
+ appears.
165
+ See :paper:`lvis` (>= v2) Appendix B.2.
166
+
167
+ Args:
168
+ dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
169
+ repeat_thresh (float): frequency threshold below which data is repeated.
170
+ If the frequency is half of `repeat_thresh`, the image will be
171
+ repeated twice.
172
+
173
+ Returns:
174
+ torch.Tensor:
175
+ the i-th element is the repeat factor for the dataset image at index i.
176
+ """
177
+ # 1. For each category c, compute the fraction of images that contain it: f(c)
178
+ category_freq = defaultdict(int)
179
+ for dataset_dict in dataset_dicts: # For each image (without repeats)
180
+ cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
181
+ for cat_id in cat_ids:
182
+ if cat_id < 0: continue
183
+ category_freq[cat_id] += 1
184
+ num_images = len(dataset_dicts)
185
+ for k, v in category_freq.items():
186
+ category_freq[k] = v / num_images
187
+
188
+ # 2. For each category c, compute the category-level repeat factor:
189
+ # r(c) = max(1, sqrt(t / f(c)))
190
+ category_rep = {
191
+ cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
192
+ for cat_id, cat_freq in category_freq.items()
193
+ }
194
+
195
+ # 3. For each image I, compute the image-level repeat factor:
196
+ # r(I) = max_{c in I} r(c)
197
+ rep_factors = []
198
+ for dataset_dict in dataset_dicts:
199
+ cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
200
+ rep_factor = max({category_rep[cat_id] for cat_id in cat_ids if cat_id >= 0}, default=1.0)
201
+ rep_factors.append(rep_factor)
202
+
203
+ return torch.tensor(rep_factors, dtype=torch.float32)
204
+
205
+ @configurable(from_config=_train_loader_from_config)
206
+ def build_detection_train_loader(dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0):
207
+ if isinstance(dataset, list):
208
+ dataset = DatasetFromList(dataset, copy=False)
209
+ if mapper is not None:
210
+ dataset = MapDataset(dataset, mapper)
211
+ if sampler is None:
212
+ sampler = TrainingSampler(len(dataset))
213
+ assert isinstance(sampler, torch.utils.data.Sampler)
214
+ return build_batch_data_loader(
215
+ dataset,
216
+ sampler,
217
+ total_batch_size,
218
+ aspect_ratio_grouping=aspect_ratio_grouping,
219
+ num_workers=num_workers
220
+ )
221
+
222
+ def _test_loader_from_config(cfg, dataset_name, batch_size=1, mapper=None, filter_empty=False):
223
+ if isinstance(dataset_name, str):
224
+ dataset_name = [dataset_name]
225
+
226
+ dataset = get_detection_dataset_dicts(
227
+ dataset_name,
228
+ filter_empty=filter_empty,
229
+ proposal_files=[
230
+ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name
231
+ ]
232
+ if cfg.MODEL.LOAD_PROPOSALS
233
+ else None,
234
+ )
235
+ if mapper is None:
236
+ mapper = DatasetMapper(cfg, False)
237
+
238
+ return {"dataset": dataset, "mapper": mapper, 'batch_size':batch_size, "num_workers": cfg.DATALOADER.NUM_WORKERS}
239
+
240
+ @configurable(from_config=_test_loader_from_config)
241
+ def build_detection_test_loader(dataset, *, mapper, batch_size=1, sampler=None, num_workers=0):
242
+
243
+ if isinstance(dataset, list):
244
+ dataset = DatasetFromList(dataset, copy=False)
245
+ if mapper is not None:
246
+ dataset = MapDataset(dataset, mapper)
247
+ if sampler is None:
248
+ sampler = InferenceSampler(len(dataset))
249
+
250
+ # Always use 1 image per worker during inference since this is the
251
+ # standard when reporting inference time in papers.
252
+ batch_sampler = torch.utils.data.BatchSampler(sampler, batch_size=batch_size, drop_last=False)
253
+ data_loader = torch.utils.data.DataLoader(
254
+ dataset,
255
+ num_workers=num_workers,
256
+ batch_sampler=batch_sampler,
257
+ collate_fn=trivial_batch_collator,
258
+ )
259
+ return data_loader
260
+
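A brief worked example of the repeat-factor rule implemented above (illustration only): with repeat_thresh t = 1.0, a category seen in 25% of images gets r(c) = max(1, sqrt(1.0 / 0.25)) = 2, so images containing it are sampled roughly twice as often, while categories with f(c) >= t keep a factor of 1.

import math

def category_repeat_factor(freq, repeat_thresh=1.0):
    # r(c) = max(1, sqrt(t / f(c))), as in repeat_factors_from_category_frequency above
    return max(1.0, math.sqrt(repeat_thresh / freq))

assert category_repeat_factor(0.25) == 2.0       # rare category: repeated
assert category_repeat_factor(0.5, 0.25) == 1.0  # frequent category: no repeat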
cubercnn/data/builtin.py ADDED
@@ -0,0 +1,46 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+
3
+ def get_omni3d_categories(dataset="omni3d"):
4
+ """
5
+ Returns the Omni3D categories for dataset
6
+ Args:
7
+ dataset: str
8
+ Returns:
9
+ cats: set of strings with category names
10
+ """
11
+
12
+ if dataset == "omni3d":
13
+ cats = set({'chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin'})
14
+ assert len(cats) == 50
15
+ elif dataset == "omni3d_in":
16
+ cats = set({'stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet'})
17
+ assert len(cats) == 38
18
+ elif dataset == "omni3d_out":
19
+ cats = set({'cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle'})
20
+ assert len(cats) == 11
21
+ elif dataset in ["SUNRGBD_train", "SUNRGBD_val", "SUNRGBD_test", "SUNRGBD_train_mini", "SUNRGBD_val_mini", "SUNRGBD_test_mini", "SUNRGBD_test_mini2", "SUNRGBD_test_mini500"]:
22
+ cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'})
23
+ assert len(cats) == 38
24
+ elif dataset in ["Hypersim_train", "Hypersim_val"]:
25
+ cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'})
26
+ assert len(cats) == 29
27
+ elif dataset == "Hypersim_test":
28
+ # Hypersim test annotation does not contain toilet
29
+ cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'})
30
+ assert len(cats) == 28
31
+ elif dataset in ["ARKitScenes_train", "ARKitScenes_val", "ARKitScenes_test"]:
32
+ cats = set({'table', 'bed', 'sofa', 'television', 'refrigerator', 'chair', 'oven', 'machine', 'stove', 'shelves', 'sink', 'cabinet', 'bathtub', 'toilet'})
33
+ assert len(cats) == 14
34
+ elif dataset in ["Objectron_train", "Objectron_val", "Objectron_test"]:
35
+ cats = set({'bicycle', 'books', 'bottle', 'camera', 'cereal box', 'chair', 'cup', 'laptop', 'shoes'})
36
+ assert len(cats) == 9
37
+ elif dataset in ["KITTI_train", "KITTI_val", "KITTI_test"]:
38
+ cats = set({'pedestrian', 'car', 'cyclist', 'van', 'truck'})
39
+ assert len(cats) == 5
40
+ elif dataset in ["nuScenes_train", "nuScenes_val", "nuScenes_test"]:
41
+ cats = set({'pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer'})
42
+ assert len(cats) == 9
43
+ else:
44
+ raise ValueError("%s dataset is not registered." % (dataset))
45
+
46
+ return cats
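A quick usage example for the category helper above; the import path assumes the package layout added in this commit.

from cubercnn.data.builtin import get_omni3d_categories

outdoor_cats = get_omni3d_categories("omni3d_out")
assert "car" in outdoor_cats and len(outdoor_cats) == 11
print(sorted(outdoor_cats))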
cubercnn/data/dataset_mapper.py ADDED
@@ -0,0 +1,272 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import copy
3
+ import logging
4
+ from detectron2.config.config import configurable
5
+ from detectron2.data.transforms.augmentation import AugmentationList
6
+ import torch
7
+ import numpy as np
8
+ from detectron2.structures import BoxMode, Keypoints
9
+ from detectron2.data import detection_utils
10
+ from detectron2.data import transforms as T
11
+ from detectron2.data import (
12
+ DatasetMapper
13
+ )
14
+ from detectron2.structures import (
15
+ Boxes,
16
+ BoxMode,
17
+ Instances,
18
+ )
19
+
20
+ from typing import List, Optional, Union
21
+
22
+ from PIL import Image
23
+
24
+ class DatasetMapper3D(DatasetMapper):
25
+
26
+ @configurable
27
+ def __init__(
28
+ self,
29
+ is_train: bool,
30
+ *,
31
+ augmentations: List[Union[T.Augmentation, T.Transform]],
32
+ image_format: str,
33
+ mode:str=None,
34
+ use_instance_mask: bool = False,
35
+ use_keypoint: bool = False,
36
+ instance_mask_format: str = "polygon",
37
+ keypoint_hflip_indices: Optional[np.ndarray] = None,
38
+ precomputed_proposal_topk: Optional[int] = None,
39
+ recompute_boxes: bool = False,
40
+ only_2d: bool = False,
41
+ ):
42
+ """
43
+ NOTE: this interface is experimental.
44
+
45
+ Args:
46
+ is_train: whether it's used in training or inference
47
+ mode: 'get_depth_maps' (default), 'cube_rcnn'
48
+ augmentations: a list of augmentations or deterministic transforms to apply
49
+ image_format: an image format supported by :func:`detection_utils.read_image`.
50
+ use_instance_mask: whether to process instance segmentation annotations, if available
51
+ use_keypoint: whether to process keypoint annotations if available
52
+ instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
53
+ masks into this format.
54
+ keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
55
+ precomputed_proposal_topk: if given, will load pre-computed
56
+ proposals from dataset_dict and keep the top k proposals for each image.
57
+ recompute_boxes: whether to overwrite bounding box annotations
58
+ by computing tight bounding boxes from instance mask annotations.
59
+ """
60
+ if recompute_boxes:
61
+ assert use_instance_mask, "recompute_boxes requires instance masks"
62
+ # fmt: off
63
+ self.is_train = is_train
64
+ self.augmentations = T.AugmentationList(augmentations)
65
+ self.image_format = image_format
66
+ self.use_instance_mask = use_instance_mask
67
+ self.instance_mask_format = instance_mask_format
68
+ self.use_keypoint = use_keypoint
69
+ self.keypoint_hflip_indices = keypoint_hflip_indices
70
+ self.proposal_topk = precomputed_proposal_topk
71
+ self.recompute_boxes = recompute_boxes
72
+ self.only_2d = only_2d
73
+ self.mode = mode
74
+ # fmt: on
75
+ logger = logging.getLogger(__name__)
76
+ mode_out = "training" if is_train else "inference"
77
+ logger.info(f"[DatasetMapper] Augmentations used in {mode_out}: {augmentations}")
78
+
79
+ @classmethod
80
+ def from_config(cls, cfg, is_train: bool = True, mode='get_depth_maps'):
81
+ augs = detection_utils.build_augmentation(cfg, is_train)
82
+ if cfg.INPUT.CROP.ENABLED and is_train:
83
+ augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
84
+ recompute_boxes = cfg.MODEL.MASK_ON
85
+ else:
86
+ recompute_boxes = False
87
+
88
+ ret = {
89
+ "is_train": is_train,
90
+ "mode": mode,
91
+ "augmentations": augs,
92
+ "image_format": cfg.INPUT.FORMAT,
93
+ "use_instance_mask": cfg.MODEL.MASK_ON,
94
+ "instance_mask_format": cfg.INPUT.MASK_FORMAT,
95
+ "use_keypoint": cfg.MODEL.KEYPOINT_ON,
96
+ "recompute_boxes": recompute_boxes,
97
+ "only_2d": cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D == 0.0,
98
+ }
99
+
100
+ if cfg.MODEL.KEYPOINT_ON:
101
+ ret["keypoint_hflip_indices"] = detection_utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
102
+
103
+ if cfg.MODEL.LOAD_PROPOSALS:
104
+ ret["precomputed_proposal_topk"] = (
105
+ cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
106
+ if is_train
107
+ else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
108
+ )
109
+ return ret
110
+
111
+ def __call__(self, dataset_dict):
112
+
113
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
114
+
115
+ image = detection_utils.read_image(dataset_dict["file_name"], format=self.image_format)
116
+ detection_utils.check_image_size(dataset_dict, image)
117
+
118
+ aug_input = T.AugInput(image)
119
+ # state = torch.get_rng_state()
120
+ transforms = self.augmentations(aug_input)
121
+ image = aug_input.image
122
+ image_shape = image.shape[:2] # h, w
123
+
124
+ # dont load ground map and depth map when
125
+ if not self.only_2d:
126
+ if 'depth_image_path' in dataset_dict:
127
+ dp_img = Image.fromarray(np.load(dataset_dict["depth_image_path"])['depth'])
128
+ dp_img = np.array(dp_img.resize(image.shape[:2][::-1], Image.NEAREST))
129
+ aug_input_dp = T.AugInput(dp_img)
130
+ aug_only_flip = AugmentationList(transforms[-1:])
131
+ # torch.set_rng_state(state)
132
+ #transforms_dp = aug_only_flip(aug_input_dp)
133
+ dp_image = aug_input_dp.image
134
+ dataset_dict["depth_map"] = torch.as_tensor(np.ascontiguousarray(dp_image))
135
+ else:
136
+ dataset_dict["depth_map"] = None
137
+
138
+ # ground image
139
+ if 'ground_image_path' in dataset_dict:
140
+ ground_img = Image.fromarray(np.load(dataset_dict["ground_image_path"])['mask'])
141
+ ground_img = np.array(ground_img.resize(image.shape[:2][::-1], Image.NEAREST))
142
+ aug_input_gr = T.AugInput(ground_img)
143
+ #transforms_gr = aug_only_flip(aug_input_gr)
144
+ gr_image = aug_input_gr.image
145
+ dataset_dict["ground_map"] = torch.as_tensor(np.ascontiguousarray(gr_image))
146
+ else:
147
+ dataset_dict["ground_map"] = None
148
+
149
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
150
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
151
+ # Therefore it's important to use torch.Tensor.
152
+ dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
153
+
154
+ # no need for additional processing at inference
155
+ if not self.is_train:
156
+ return dataset_dict
157
+
158
+ if "annotations" in dataset_dict:
159
+
160
+ dataset_id = dataset_dict['dataset_id']
161
+ K = np.array(dataset_dict['K'])
162
+
163
+ unknown_categories = self.dataset_id_to_unknown_cats[dataset_id]
164
+
165
+ # transform and pop off annotations
166
+ annos = [
167
+ transform_instance_annotations(obj, transforms, K=K)
168
+ for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0
169
+ ]
170
+
171
+ # convert to instance format
172
+ instances = annotations_to_instances(annos, image_shape, unknown_categories)
173
+ dataset_dict["instances"] = detection_utils.filter_empty_instances(instances)
174
+
175
+ return dataset_dict
176
+
177
+ '''
178
+ Cached for mirroring annotations
179
+ '''
180
+ _M1 = np.array([
181
+ [1, 0, 0],
182
+ [0, -1, 0],
183
+ [0, 0, -1]
184
+ ])
185
+ _M2 = np.array([
186
+ [-1., 0., 0.],
187
+ [ 0., -1., 0.],
188
+ [ 0., 0., 1.]
189
+ ])
190
+
191
+
192
+ def transform_instance_annotations(annotation, transforms, *, K):
193
+
194
+ if isinstance(transforms, (tuple, list)):
195
+ transforms = T.TransformList(transforms)
196
+
197
+ # bbox is 1d (per-instance bounding box)
198
+ bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
199
+ bbox = transforms.apply_box(np.array([bbox]))[0]
200
+
201
+ annotation["bbox"] = bbox
202
+ annotation["bbox_mode"] = BoxMode.XYXY_ABS
203
+
204
+ if annotation['center_cam'][2] != 0:
205
+
206
+ # project the 3D box annotation XYZ_3D to screen
207
+ point3D = annotation['center_cam']
208
+ point2D = K @ np.array(point3D)
209
+ point2D[:2] = point2D[:2] / point2D[-1]
210
+ annotation["center_cam_proj"] = point2D.tolist()
211
+
212
+ # apply coords transforms to 2D box
213
+ annotation["center_cam_proj"][0:2] = transforms.apply_coords(
214
+ point2D[np.newaxis][:, :2]
215
+ )[0].tolist()
216
+
217
+ keypoints = (K @ np.array(annotation["bbox3D_cam"]).T).T
218
+ keypoints[:, 0] /= keypoints[:, -1]
219
+ keypoints[:, 1] /= keypoints[:, -1]
220
+
221
+ if annotation['ignore']:
222
+ # all keypoints marked as not visible
223
+ # 0 - unknown, 1 - not visible, 2 visible
224
+ keypoints[:, 2] = 1
225
+ else:
226
+
227
+ valid_keypoints = keypoints[:, 2] > 0
228
+
229
+ # 0 - unknown, 1 - not visible, 2 visible
230
+ keypoints[:, 2] = 2
231
+ keypoints[valid_keypoints, 2] = 2
232
+
233
+ # in place
234
+ transforms.apply_coords(keypoints[:, :2])
235
+ annotation["keypoints"] = keypoints.tolist()
236
+
237
+ # manually apply mirror for pose
238
+ for transform in transforms:
239
+
240
+ # horrizontal flip?
241
+ if isinstance(transform, T.HFlipTransform):
242
+
243
+ pose = _M1 @ np.array(annotation["pose"]) @ _M2
244
+ annotation["pose"] = pose.tolist()
245
+ annotation["R_cam"] = pose.tolist()
246
+
247
+ return annotation
248
+
249
+
250
+ def annotations_to_instances(annos, image_size, unknown_categories):
251
+
252
+ # init
253
+ target = Instances(image_size)
254
+
255
+ # add classes, 2D boxes, 3D boxes and poses
256
+ target.gt_classes = torch.tensor([int(obj["category_id"]) for obj in annos], dtype=torch.int64)
257
+ target.gt_boxes = Boxes([BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos])
258
+ target.gt_boxes3D = torch.FloatTensor([anno['center_cam_proj'] + anno['dimensions'] + anno['center_cam'] for anno in annos])
259
+ target.gt_poses = torch.FloatTensor([anno['pose'] for anno in annos])
260
+
261
+ n = len(target.gt_classes)
262
+
263
+ # do keypoints?
264
+ target.gt_keypoints = Keypoints(torch.FloatTensor([anno['keypoints'] for anno in annos]))
265
+
266
+ gt_unknown_category_mask = torch.zeros(max(unknown_categories)+1, dtype=bool)
267
+ gt_unknown_category_mask[torch.tensor(list(unknown_categories))] = True
268
+
269
+ # include available category indices as tensor with GTs
270
+ target.gt_unknown_category_mask = gt_unknown_category_mask.unsqueeze(0).repeat([n, 1])
271
+
272
+ return target
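A small illustrative check (not part of the mapper) that the horizontal-flip convention used in transform_instance_annotations, pose -> _M1 @ R @ _M2, keeps the pose a proper rotation, since both mirror matrices are themselves 180-degree rotations:

import numpy as np

_M1 = np.diag([1.0, -1.0, -1.0])
_M2 = np.diag([-1.0, -1.0, 1.0])

# an arbitrary example rotation: 90 degrees about the z-axis
R = np.array([[0.0, -1.0, 0.0],
              [1.0,  0.0, 0.0],
              [0.0,  0.0, 1.0]])
R_flipped = _M1 @ R @ _M2

assert np.allclose(R_flipped @ R_flipped.T, np.eye(3))  # still orthogonal
assert np.isclose(np.linalg.det(R_flipped), 1.0)        # determinant stays +1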
cubercnn/data/datasets.py ADDED
@@ -0,0 +1,480 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import json
3
+ import time
4
+ import os
5
+ import contextlib
6
+ import io
7
+ import logging
8
+ import numpy as np
9
+ import pandas as pd
10
+ from pycocotools.coco import COCO
11
+ from collections import defaultdict
12
+ from fvcore.common.timer import Timer
13
+ from detectron2.utils.file_io import PathManager
14
+ from detectron2.structures import BoxMode
15
+ from detectron2.data import MetadataCatalog, DatasetCatalog
16
+
17
+ from cubercnn import util
18
+
19
+ VERSION = '0.1'
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ def get_version():
24
+ return VERSION
25
+
26
+ def get_global_dataset_stats(path_to_stats=None, reset=False):
27
+
28
+ if path_to_stats is None:
29
+ path_to_stats = os.path.join('datasets', 'Omni3D', 'stats.json')
30
+
31
+ if os.path.exists(path_to_stats) and not reset:
32
+ stats = util.load_json(path_to_stats)
33
+
34
+ else:
35
+ stats = {
36
+ 'n_datasets': 0,
37
+ 'n_ims': 0,
38
+ 'n_anns': 0,
39
+ 'categories': []
40
+ }
41
+
42
+ return stats
43
+
44
+
45
+ def save_global_dataset_stats(stats, path_to_stats=None):
46
+
47
+ if path_to_stats is None:
48
+ path_to_stats = os.path.join('datasets', 'Omni3D', 'stats.json')
49
+
50
+ util.save_json(path_to_stats, stats)
51
+
52
+
53
+ def get_filter_settings_from_cfg(cfg=None):
54
+
55
+ if cfg is None:
56
+ return {
57
+ 'category_names': [],
58
+ 'ignore_names': [],
59
+ 'truncation_thres': 0.99,
60
+ 'visibility_thres': 0.01,
61
+ 'min_height_thres': 0.00,
62
+ 'max_height_thres': 1.50,
63
+ 'modal_2D_boxes': False,
64
+ 'trunc_2D_boxes': False,
65
+ 'max_depth': 1e8,
66
+ }
67
+ else:
68
+ return {
69
+ 'category_names': cfg.DATASETS.CATEGORY_NAMES,
70
+ 'ignore_names': cfg.DATASETS.IGNORE_NAMES,
71
+ 'truncation_thres': cfg.DATASETS.TRUNCATION_THRES,
72
+ 'visibility_thres': cfg.DATASETS.VISIBILITY_THRES,
73
+ 'min_height_thres': cfg.DATASETS.MIN_HEIGHT_THRES,
74
+ 'modal_2D_boxes': cfg.DATASETS.MODAL_2D_BOXES,
75
+ 'trunc_2D_boxes': cfg.DATASETS.TRUNC_2D_BOXES,
76
+ 'max_depth': cfg.DATASETS.MAX_DEPTH,
77
+
78
+ # TODO expose as a config
79
+ 'max_height_thres': 1.50,
80
+ }
81
+
82
+
83
+ def is_ignore(anno, filter_settings, image_height):
84
+
85
+ ignore = anno['behind_camera']
86
+ ignore |= (not bool(anno['valid3D']))
87
+
88
+ if ignore:
89
+ return ignore
90
+
91
+ ignore |= anno['dimensions'][0] <= 0.01
92
+ ignore |= anno['dimensions'][1] <= 0.01
93
+ ignore |= anno['dimensions'][2] <= 0.01
94
+ ignore |= anno['center_cam'][2] > filter_settings['max_depth']
95
+ ignore |= (anno['lidar_pts'] == 0)
96
+ ignore |= (anno['segmentation_pts'] == 0)
97
+ ignore |= (anno['depth_error'] > 0.5)
98
+
99
+ # tightly annotated 2D boxes are not always available.
100
+ if filter_settings['modal_2D_boxes'] and 'bbox2D_tight' in anno and anno['bbox2D_tight'][0] != -1:
101
+ bbox2D = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
102
+
103
+ # truncated projected 2D boxes are also not always available.
104
+ elif filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]):
105
+ bbox2D = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
106
+
107
+ # use the projected 3D --> 2D box, which requires a visible 3D cuboid.
108
+ elif 'bbox2D_proj' in anno:
109
+ bbox2D = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
110
+
111
+ else:
112
+ bbox2D = anno['bbox']
113
+
114
+ ignore |= bbox2D[3] <= filter_settings['min_height_thres']*image_height
115
+ ignore |= bbox2D[3] >= filter_settings['max_height_thres']*image_height
116
+
117
+ ignore |= (anno['truncation'] >=0 and anno['truncation'] >= filter_settings['truncation_thres'])
118
+ ignore |= (anno['visibility'] >= 0 and anno['visibility'] <= filter_settings['visibility_thres'])
119
+
120
+ if 'ignore_names' in filter_settings:
121
+ ignore |= anno['category_name'] in filter_settings['ignore_names']
122
+
123
+ return ignore
124
+
125
+
126
+ def simple_register(dataset_name, filter_settings, filter_empty=True, datasets_root_path=None):
127
+
128
+ if datasets_root_path is None:
129
+ datasets_root_path = os.path.join('datasets', 'Omni3D')
130
+
131
+ path_to_json = os.path.join(datasets_root_path, dataset_name + '.json')
132
+ path_to_image_root = 'datasets'
133
+
134
+ DatasetCatalog.register(dataset_name, lambda: load_omni3d_json(
135
+ path_to_json, path_to_image_root,
136
+ dataset_name, filter_settings, filter_empty=filter_empty
137
+ ))
138
+
139
+ MetadataCatalog.get(dataset_name).set(json_file=path_to_json, image_root=path_to_image_root, evaluator_type="coco")
140
+
141
+ class Omni3D(COCO):
142
+ '''
143
+ Class for COCO-like dataset object. Not inherently related to
144
+ use with Detectron2 or training per se.
145
+ '''
146
+
147
+ def __init__(self, annotation_files, filter_settings=None):
148
+
149
+ # load dataset
150
+ self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
151
+ self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
152
+
153
+ self.idx_without_ground = set(pd.read_csv('datasets/no_ground_idx.csv')['img_id'].values)
154
+
155
+ if isinstance(annotation_files, str):
156
+ annotation_files = [annotation_files,]
157
+
158
+ cats_ids_master = []
159
+ cats_master = []
160
+
161
+ for annotation_file in annotation_files:
162
+
163
+ _, name, _ = util.file_parts(annotation_file)
164
+
165
+ logger.info('loading {} annotations into memory...'.format(name))
166
+ dataset = json.load(open(annotation_file, 'r'))
167
+ assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
168
+
169
+ if type(dataset['info']) == list:
170
+ dataset['info'] = dataset['info'][0]
171
+
172
+ dataset['info']['known_category_ids'] = [cat['id'] for cat in dataset['categories']]
173
+
174
+ # first dataset
175
+ if len(self.dataset) == 0:
176
+ self.dataset = dataset
177
+
178
+ # concatenate datasets
179
+ else:
180
+
181
+ if type(self.dataset['info']) == dict:
182
+ self.dataset['info'] = [self.dataset['info']]
183
+
184
+ self.dataset['info'] += [dataset['info']]
185
+ self.dataset['annotations'] += dataset['annotations']
186
+ self.dataset['images'] += dataset['images']
187
+
188
+ # sort through categories
189
+ for cat in dataset['categories']:
190
+
191
+ if not cat['id'] in cats_ids_master:
192
+ cats_ids_master.append(cat['id'])
193
+ cats_master.append(cat)
194
+
195
+ if filter_settings is None:
196
+
197
+ # include every category in the master list
198
+ self.dataset['categories'] = [
199
+ cats_master[i]
200
+ for i in np.argsort(cats_ids_master)
201
+ ]
202
+
203
+ else:
204
+
205
+ # determine which categories we may actually use for filtering.
206
+ trainable_cats = set(filter_settings['ignore_names']) | set(filter_settings['category_names'])
207
+
208
+ # category names are provided to us
209
+ if len(filter_settings['category_names']) > 0:
210
+
211
+ self.dataset['categories'] = [
212
+ cats_master[i]
213
+ for i in np.argsort(cats_ids_master)
214
+ if cats_master[i]['name'] in filter_settings['category_names']
215
+ ]
216
+
217
+ # no categories are provided, so assume use ALL available.
218
+ else:
219
+
220
+ self.dataset['categories'] = [
221
+ cats_master[i]
222
+ for i in np.argsort(cats_ids_master)
223
+ ]
224
+
225
+ filter_settings['category_names'] = [cat['name'] for cat in self.dataset['categories']]
226
+
227
+ trainable_cats = trainable_cats | set(filter_settings['category_names'])
228
+
229
+ valid_anns = []
230
+ im_height_map = {}
231
+
232
+ for im_obj in self.dataset['images']:
233
+ im_height_map[im_obj['id']] = im_obj['height']
234
+
235
+ # Filter out annotations
236
+ for anno_idx, anno in enumerate(self.dataset['annotations']):
237
+
238
+ im_height = im_height_map[anno['image_id']]
239
+
240
+ ignore = is_ignore(anno, filter_settings, im_height)
241
+
242
+ if filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]):
243
+ bbox2D = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
244
+
245
+ elif anno['bbox2D_proj'][0] != -1:
246
+ bbox2D = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
247
+
248
+ elif anno['bbox2D_tight'][0] != -1:
249
+ bbox2D = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
250
+
251
+ else:
252
+ continue
253
+
254
+ width = bbox2D[2]
255
+ height = bbox2D[3]
256
+
257
+ self.dataset['annotations'][anno_idx]['area'] = width*height
258
+ self.dataset['annotations'][anno_idx]['iscrowd'] = False
259
+ self.dataset['annotations'][anno_idx]['ignore'] = ignore
260
+ self.dataset['annotations'][anno_idx]['ignore2D'] = ignore
261
+ self.dataset['annotations'][anno_idx]['ignore3D'] = ignore
262
+
263
+ if filter_settings['modal_2D_boxes'] and anno['bbox2D_tight'][0] != -1:
264
+ self.dataset['annotations'][anno_idx]['bbox'] = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
265
+
266
+ else:
267
+ self.dataset['annotations'][anno_idx]['bbox'] = bbox2D
268
+
269
+ self.dataset['annotations'][anno_idx]['bbox3D'] = anno['bbox3D_cam']
270
+ self.dataset['annotations'][anno_idx]['depth'] = anno['center_cam'][2]
271
+
272
+ category_name = anno["category_name"]
273
+
274
+ # category is part of trainable categories?
275
+ if category_name in trainable_cats:
276
+ if not ignore:
277
+ valid_anns.append(self.dataset['annotations'][anno_idx])
278
+
279
+ self.dataset['annotations'] = valid_anns
280
+
281
+ # append depth image path to each image corresponding to the id
282
+ # for img in self.dataset['images']:
283
+ # img_id = img['id']
284
+ # img['depth_image_path'] = f'datasets/depth_maps/{img_id}.npz'
285
+ # if not img_id in self.idx_without_ground:
286
+ # img['ground_image_path'] = f'datasets/ground_maps/{img_id}.npz'
287
+
288
+ self.createIndex()
289
+
290
+ def info(self):
291
+
292
+ infos = self.dataset['info']
293
+ if type(infos) == dict:
294
+ infos = [infos]
295
+
296
+ for i, info in enumerate(infos):
297
+ print('Dataset {}/{}'.format(i+1, len(infos)))
298
+
299
+ for key, value in info.items():
300
+ print('{}: {}'.format(key, value))
301
+
302
+
303
+ def register_and_store_model_metadata(datasets, output_dir, filter_settings=None):
304
+
305
+ output_file = os.path.join(output_dir, 'category_meta.json')
306
+
307
+ if os.path.exists(output_file):
308
+ metadata = util.load_json(output_file)
309
+ thing_classes = metadata['thing_classes']
310
+ id_map = metadata['thing_dataset_id_to_contiguous_id']
311
+
312
+ # json saves id map as strings rather than ints
313
+ id_map = {int(idA):idB for idA, idB in id_map.items()}
314
+
315
+ else:
316
+ omni3d_stats = util.load_json(os.path.join('datasets', 'Omni3D', 'stats.json'))
317
+ thing_classes = filter_settings['category_names']
318
+
319
+ cat_ids = []
320
+ for cat in thing_classes:
321
+ cat_idx = omni3d_stats['category_names'].index(cat)
322
+ cat_id = omni3d_stats['categories'][cat_idx]['id']
323
+ cat_ids.append(cat_id)
324
+
325
+ cat_order = np.argsort(cat_ids)
326
+ cat_ids = [cat_ids[i] for i in cat_order]
327
+ thing_classes = [thing_classes[i] for i in cat_order]
328
+ id_map = {id: i for i, id in enumerate(cat_ids)}
329
+
330
+ util.save_json(output_file, {
331
+ 'thing_classes': thing_classes,
332
+ 'thing_dataset_id_to_contiguous_id': id_map,
333
+ })
334
+
335
+ MetadataCatalog.get('omni3d_model').thing_classes = thing_classes
336
+ MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id = id_map
337
+
338
+
339
+ def load_omni3d_json(json_file, image_root, dataset_name, filter_settings, filter_empty=True):
340
+
341
+ # read in the dataset
342
+ timer = Timer()
343
+ json_file = PathManager.get_local_path(json_file)
344
+ with contextlib.redirect_stdout(io.StringIO()):
345
+ coco_api = COCO(json_file)
346
+ ground_map_files = os.listdir('datasets/ground_maps')
347
+ ground_idx = []
348
+ for file in ground_map_files:
349
+ try:
350
+ idx = int(file.split('.')[0])
351
+ ground_idx.append(idx)
352
+ except:
353
+ pass
354
+ depth_map_files = os.listdir('datasets/depth_maps')
355
+ depth_idx = []
356
+ for file in depth_map_files:
357
+ try:
358
+ idx = int(file.split('.')[0])
359
+ depth_idx.append(idx)
360
+ except:
361
+ pass
362
+ if timer.seconds() > 1:
363
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
364
+
365
+ # the global meta information for the full dataset
366
+ meta_model = MetadataCatalog.get('omni3d_model')
367
+
368
+ # load the meta information
369
+ meta = MetadataCatalog.get(dataset_name)
370
+ cat_ids = sorted(coco_api.getCatIds(filter_settings['category_names']))
371
+ cats = coco_api.loadCats(cat_ids)
372
+ thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
373
+ meta.thing_classes = thing_classes
374
+
375
+ # the id mapping must be based on the model!
376
+ id_map = meta_model.thing_dataset_id_to_contiguous_id
377
+ meta.thing_dataset_id_to_contiguous_id = id_map
378
+
379
+ # sort indices for reproducible results
380
+ img_ids = sorted(coco_api.imgs.keys())
381
+ imgs = coco_api.loadImgs(img_ids)
382
+ anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
383
+ total_num_valid_anns = sum([len(x) for x in anns])
384
+ total_num_anns = len(coco_api.anns)
385
+ if total_num_valid_anns < total_num_anns:
386
+ logger.info(
387
+ f"{json_file} contains {total_num_anns} annotations, but only "
388
+ f"{total_num_valid_anns} of them match to images in the file."
389
+ )
390
+
391
+ imgs_anns = list(zip(imgs, anns))
392
+ logger.info("Loaded {} images in Omni3D format from {}".format(len(imgs_anns), json_file))
393
+
394
+ dataset_dicts = []
395
+
396
+ # annotation keys to pass along
397
+ ann_keys = [
398
+ "bbox", "bbox3D_cam", "bbox2D_proj", "bbox2D_trunc", "bbox2D_tight",
399
+ "center_cam", "dimensions", "pose", "R_cam", "category_id",
400
+ ]
401
+
402
+ # optional per image keys to pass if exists
403
+ # this property is unique to KITTI.
404
+ img_keys_optional = ['p2']
405
+
406
+ invalid_count = 0
407
+
408
+ for img_dict, anno_dict_list in imgs_anns:
409
+
410
+ has_valid_annotation = False
411
+
412
+ record = {}
413
+ record["file_name"] = os.path.join(image_root, img_dict["file_path"])
414
+ record["dataset_id"] = img_dict["dataset_id"]
415
+ record["height"] = img_dict["height"]
416
+ record["width"] = img_dict["width"]
417
+ record["K"] = img_dict["K"]
418
+
419
+ # store optional keys when available
420
+ for img_key in img_keys_optional:
421
+ if img_key in img_dict:
422
+ record[img_key] = img_dict[img_key]
423
+
424
+ image_id = record["image_id"] = img_dict["id"]
425
+
426
+ if image_id in depth_idx:
427
+ record["depth_image_path"] = f'datasets/depth_maps/{image_id}.npz'
428
+ if image_id in ground_idx:
429
+ record["ground_image_path"] = f'datasets/ground_maps/{image_id}.npz'
430
+ objs = []
431
+ # where invalid annotations are removed
432
+ for anno in anno_dict_list:
433
+ assert anno["image_id"] == image_id
434
+
435
+ obj = {key: anno[key] for key in ann_keys if key in anno}
436
+
437
+ obj["bbox_mode"] = BoxMode.XYWH_ABS
438
+ annotation_category_id = obj["category_id"]
439
+
440
+ # category is not part of ids and is not in the ignore category?
441
+ if not (annotation_category_id in id_map) and not (anno['category_name'] in filter_settings['ignore_names']):
442
+ continue
443
+
444
+ ignore = is_ignore(anno, filter_settings, img_dict["height"])
445
+
446
+ obj['iscrowd'] = False
447
+ obj['ignore'] = ignore
448
+
449
+ if filter_settings['modal_2D_boxes'] and 'bbox2D_tight' in anno and anno['bbox2D_tight'][0] != -1:
450
+ obj['bbox'] = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
451
+
452
+ elif filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]):
453
+ obj['bbox'] = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
454
+
455
+ elif 'bbox2D_proj' in anno:
456
+ obj['bbox'] = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
457
+
458
+ else:
459
+ continue
460
+
461
+ obj['pose'] = anno['R_cam']
462
+
463
+ # store category as -1 for ignores!
464
+ # OLD Logic
465
+ obj["category_id"] = -1 if ignore else id_map[annotation_category_id]
466
+
467
+ objs.append(obj)
468
+
469
+ has_valid_annotation |= (not ignore)
470
+
471
+ if has_valid_annotation or (not filter_empty):
472
+ record["annotations"] = objs
473
+ dataset_dicts.append(record)
474
+
475
+ else:
476
+ invalid_count += 1
477
+
478
+ logger.info("Filtered out {}/{} images without valid annotations".format(invalid_count, len(imgs_anns)))
479
+
480
+ return dataset_dicts
cubercnn/data/filter_ground.py ADDED
@@ -0,0 +1,26 @@
1
+ # Basically a hotfix script to avoid having to run the ground segmentation script again
2
+ # this will filter out empty ground maps and add the indices to the no_ground_idx.csv file
3
+ # It removes ground maps with very little ground, since we assume the segmentation went wrong there
4
+ import os
5
+ import torch
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import pandas as pd
9
+
10
+ files = os.listdir('datasets/ground_maps')
11
+ no_ground = []
12
+ for file in tqdm(files):
13
+ mask = np.load(f'datasets/ground_maps/{file}')['mask']
14
+ ground_map = torch.as_tensor(mask)[::5,::5]
15
+ nnz = torch.count_nonzero(ground_map).item()
16
+ # 100 is determined from looking at the pictures
17
+ if nnz < 100:
18
+ print(nnz)
19
+ print('indices', file[:-4])
20
+ no_ground.append(int(file[:-4]))
21
+ os.remove(f'datasets/ground_maps/{file}')
22
+
23
+ df = pd.DataFrame(no_ground, columns=['img_id'])
24
+ df2 = pd.read_csv('datasets/no_ground_idx.csv')
25
+ df = pd.concat([df, df2])
26
+ df.to_csv('datasets/no_ground_idx.csv', index=False)
cubercnn/data/generate_depth_maps.py ADDED
@@ -0,0 +1,86 @@
1
+ import torch
2
+ import cv2
3
+ # might need to export PYTHONPATH=/work3/$username/3dod/
4
+ from depth.metric_depth.depth_anything_v2.dpt import DepthAnythingV2
5
+ def depth_of_images(encoder='vitl', dataset='hypersim', max_depth=20, device='cpu'):
6
+ """
7
+ Build and return a DepthAnythingV2 metric-depth model that predicts per-pixel depth for an image.
8
+
9
+ encoder = 'vitl' # or 'vits', 'vitb'
10
+ dataset = 'hypersim' # 'hypersim' for indoor model, 'vkitti' for outdoor model
11
+ max_depth = 20 # 20 for indoor model, 80 for outdoor model
12
+ """
13
+ model_configs = {
14
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
15
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
16
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
17
+ }
18
+
19
+ model = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth})
20
+ model.load_state_dict(torch.load(f'depth/checkpoints/depth_anything_v2_metric_{dataset}_{encoder}.pth', map_location=device, weights_only=False))
21
+ model.eval()
22
+ model.to(device)
23
+ return model
24
+
25
+ def init_dataset():
26
+ ''' Dataloader setup.
27
+ This is an alternative to the Omni3D dataset class and the load_omni3d_json function; the training script calls something similar to this.'''
28
+ cfg, filter_settings = get_config_and_filter_settings()
29
+
30
+ dataset_names = ['SUNRGBD_train','SUNRGBD_val','SUNRGBD_test', 'KITTI_train', 'KITTI_val', 'KITTI_test',]
31
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names]
32
+ # for dataset_name in dataset_names:
33
+ # simple_register(dataset_name, filter_settings, filter_empty=True)
34
+
35
+ # Get Image and annotations
36
+ datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
37
+ data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings)
38
+
39
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
40
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
41
+
42
+ infos = datasets.dataset['info']
43
+
44
+ dataset_id_to_unknown_cats = {}
45
+ possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1))
46
+
47
+ dataset_id_to_src = {}
48
+
49
+ for info in infos:
50
+ dataset_id = info['id']
51
+ known_category_training_ids = set()
52
+
53
+ if not dataset_id in dataset_id_to_src:
54
+ dataset_id_to_src[dataset_id] = info['source']
55
+
56
+ for id in info['known_category_ids']:
57
+ if id in dataset_id_to_contiguous_id:
58
+ known_category_training_ids.add(dataset_id_to_contiguous_id[id])
59
+
60
+ # determine and store the unknown categories.
61
+ unknown_categories = possible_categories - known_category_training_ids
62
+ dataset_id_to_unknown_cats[dataset_id] = unknown_categories
63
+
64
+ return datasets
65
+
66
+ if __name__ == '__main__':
67
+ import os
68
+ from detectron2.data.catalog import MetadataCatalog
69
+ import numpy as np
70
+
71
+ from cubercnn import data
72
+ from priors import get_config_and_filter_settings
73
+
74
+ from tqdm import tqdm
75
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
76
+ datasets = init_dataset()
77
+
78
+ os.makedirs('datasets/depth_maps', exist_ok=True)
79
+
80
+ model = depth_of_images(device=device)
81
+
82
+ for img_id, img_info in tqdm(datasets.imgs.items()):
83
+ file_path = img_info['file_path']
84
+ img = cv2.imread('datasets/'+file_path)
85
+ depth = model.infer_image(img) # HxW depth map in meters in numpy
86
+ np.savez_compressed(f'datasets/depth_maps/{img_id}.npz', depth=depth)
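A minimal sketch (hypothetical usage, assuming the script above has already been run) of reading one of the compressed depth maps it writes:

    import numpy as np

    img_id = 123  # hypothetical Omni3D image id
    # each .npz stores an HxW array of metric depth in meters under the key 'depth'
    depth = np.load(f'datasets/depth_maps/{img_id}.npz')['depth']
    print(depth.shape, float(depth.min()), float(depth.max()))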
cubercnn/data/generate_ground_segmentations.py ADDED
@@ -0,0 +1,206 @@
1
+ from segment_anything import sam_model_registry
2
+ from segment_anything.modeling import Sam
3
+ import os
4
+
5
+ def init_segmentation(device='cpu') -> Sam:
6
+ # 1) first cd into the segment_anything and pip install -e .
7
+ # to get the model, stay in the root folder and run download_model.sh
8
+ # 2) chmod +x download_model.sh && ./download_model.sh
9
+ # the largest model: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
10
+ # this is the smallest model
11
+ if os.path.exists('sam-hq/sam_hq_vit_b.pth'):
12
+ sam_checkpoint = "sam-hq/sam_hq_vit_b.pth"
13
+ model_type = "vit_b"
14
+ else:
15
+ sam_checkpoint = "sam-hq/sam_hq_vit_tiny.pth"
16
+ model_type = "vit_tiny"
17
+ print(f'SAM device: {device}, model_type: {model_type}')
18
+ sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
19
+ sam.to(device=device)
20
+ return sam
21
+
22
+
23
+ if __name__ == '__main__':
24
+ from segment_anything.utils.transforms import ResizeLongestSide
25
+ import numpy as np
26
+ import pandas as pd
27
+ import torch
28
+ import torchvision.transforms as T2
29
+ from matplotlib import pyplot as plt
30
+ from PIL import Image
31
+ from tqdm import tqdm
32
+ from torchvision.ops import box_convert
33
+
34
+ import groundingdino.datasets.transforms as T
35
+ from cubercnn import data
36
+ from detectron2.data.catalog import MetadataCatalog
37
+ from groundingdino.util.inference import load_image, load_model, predict
38
+ from priors import get_config_and_filter_settings
39
+ import supervision as sv
40
+
41
+ def init_dataset():
42
+ ''' Dataset setup.
43
+ Currently not used anywhere else, because it is unclear what the difference between the Omni3D dataset class and the load-omni3D-json functions is; this is a third alternative to those. The train script calls something similar to this.'''
44
+ cfg, filter_settings = get_config_and_filter_settings()
45
+
46
+ dataset_names = ['SUNRGBD_train','SUNRGBD_val','SUNRGBD_test', 'KITTI_train', 'KITTI_val', 'KITTI_test',]
47
+ dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names]
48
+ # for dataset_name in dataset_names:
49
+ # simple_register(dataset_name, filter_settings, filter_empty=True)
50
+
51
+ # Get Image and annotations
52
+ datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
53
+ data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings)
54
+
55
+
56
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
57
+ dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
58
+
59
+ infos = datasets.dataset['info']
60
+
61
+ dataset_id_to_unknown_cats = {}
62
+ possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1))
63
+
64
+ dataset_id_to_src = {}
65
+
66
+ for info in infos:
67
+ dataset_id = info['id']
68
+ known_category_training_ids = set()
69
+
70
+ if not dataset_id in dataset_id_to_src:
71
+ dataset_id_to_src[dataset_id] = info['source']
72
+
73
+ for id in info['known_category_ids']:
74
+ if id in dataset_id_to_contiguous_id:
75
+ known_category_training_ids.add(dataset_id_to_contiguous_id[id])
76
+
77
+ # determine and store the unknown categories.
78
+ unknown_categories = possible_categories - known_category_training_ids
79
+ dataset_id_to_unknown_cats[dataset_id] = unknown_categories
80
+
81
+ return datasets
82
+
83
+ def load_image(image_path: str, device) -> tuple[torch.Tensor, torch.Tensor]:
84
+ transform = T.Compose(
85
+ [
86
+ # T.RandomResize([800], max_size=1333),
87
+ T.ToTensor(),
88
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
89
+ ]
90
+ )
91
+ transform2 = T2.ToTensor()
92
+ image_source = Image.open(image_path).convert("RGB")
93
+ image = transform2(image_source).to(device)
94
+ image_transformed, _ = transform(image_source, None)
95
+ return image, image_transformed.to(device)
96
+
97
+
98
+ def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: list[str]) -> np.ndarray:
99
+ """
100
+ This function annotates an image with bounding boxes and labels.
101
+
102
+ Parameters:
103
+ image_source (np.ndarray): The source image to be annotated.
104
+ boxes (torch.Tensor): A tensor containing bounding box coordinates.
105
+ logits (torch.Tensor): A tensor containing confidence scores for each bounding box.
106
+ phrases (List[str]): A list of labels for each bounding box.
107
+
108
+ Returns:
109
+ np.ndarray: The annotated image.
110
+ """
111
+ h, w, _ = image_source.shape
112
+ boxes = boxes * torch.Tensor([w, h, w, h])
113
+ xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
114
+ detections = sv.Detections(xyxy=xyxy)
115
+
116
+ labels = [
117
+ f"{phrase} {logit:.2f}"
118
+ for phrase, logit
119
+ in zip(phrases, logits)
120
+ ]
121
+
122
+ box_annotator = sv.BoxAnnotator()
123
+ # annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
124
+ annotated_frame = image_source.copy()
125
+ annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
126
+ return annotated_frame
127
+
128
+
129
+ datasets = init_dataset()
130
+
131
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
132
+ # model.to(device)
133
+
134
+ segmentor = init_segmentation(device=device)
135
+
136
+ os.makedirs('datasets/ground_maps', exist_ok=True)
137
+ model = load_model("GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", "GroundingDINO/weights/groundingdino_swint_ogc.pth", device=device)
138
+ TEXT_PROMPT = "ground"
139
+ BOX_TRESHOLD = 0.35
140
+ TEXT_TRESHOLD = 0.25
141
+
142
+ noground = 0
143
+ no_ground_idx = []
144
+
145
+ # **** to annotate full dataset ****
146
+ for img_id, img_info in tqdm(datasets.imgs.items()):
147
+ file_path = img_info['file_path']
148
+ w = img_info['width']
149
+ h = img_info['height']
150
+ # **** to annotate full dataset ****
151
+ # **** to annotate demo images ****
152
+ # for img_id in tqdm(os.listdir('datasets/coco_examples')):
153
+ # file_path = 'coco_examples/'+img_id
154
+ image_source, image = load_image('datasets/'+file_path, device=device)
155
+ # **** to annotate demo images ****
156
+
157
+ boxes, logits, phrases = predict(
158
+ model=model,
159
+ image=image,
160
+ caption=TEXT_PROMPT,
161
+ box_threshold=BOX_TRESHOLD,
162
+ text_threshold=TEXT_TRESHOLD,
163
+ device=device
164
+ )
165
+ if len(boxes) == 0:
166
+ print(f"No ground found for {img_id}")
167
+ noground += 1
168
+ # save a ground map that is all zeros
169
+ no_ground_idx.append(img_id)
170
+ continue
171
+ # only want box corresponding to max logit
172
+ max_logit_idx = torch.argmax(logits)
173
+ logit = logits[max_logit_idx].unsqueeze(0)
174
+ box = boxes[max_logit_idx].unsqueeze(0)
175
+ phrase = [phrases[max_logit_idx]]
176
+
177
+ _, h, w = image_source.shape
178
+ box = box * torch.tensor([w, h, w, h], device=device)
179
+ xyxy = box_convert(boxes=box, in_fmt="cxcywh", out_fmt="xyxy")
180
+
181
+ image = image.unsqueeze(0)
182
+ org_shape = image.shape[-2:]
183
+ resize_transform = ResizeLongestSide(segmentor.image_encoder.img_size)
184
+ batched_input = []
185
+ images = resize_transform.apply_image_torch(image*1.0)# .permute(2, 0, 1).contiguous()
186
+ for image, boxes in zip(images, xyxy):
187
+ transformed_boxes = resize_transform.apply_boxes_torch(boxes, org_shape) # Bx4
188
+ batched_input.append({'image': image, 'boxes': transformed_boxes, 'original_size':org_shape})
189
+
190
+ seg_out = segmentor(batched_input, multimask_output=False)
191
+ mask_per_image = seg_out[0]['masks']
192
+
193
+ nnz = torch.count_nonzero(mask_per_image, dim=(-2, -1))
194
+ indices = torch.nonzero(nnz <= 1000).flatten()
195
+ if len(indices) > 0:
196
+ noground += 1
197
+ # save a ground map that is all zeros
198
+ no_ground_idx.append(img_id)
199
+
200
+ np.savez_compressed(f'datasets/ground_maps/{img_id}.npz', mask=mask_per_image.cpu()[0,0,:,:].numpy())
201
+
202
+ print(f"Could not find ground for {noground} images")
203
+
204
+
205
+ df = pd.DataFrame(no_ground_idx, columns=['img_id'])
206
+ df.to_csv('datasets/no_ground_idx.csv', index=False)
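A minimal sketch (hypothetical usage, not part of this commit) of loading one of the ground masks saved above, mirroring the check that filter_ground.py performs:

    import numpy as np
    import torch

    img_id = 123  # hypothetical Omni3D image id
    # each .npz stores an HxW boolean ground mask under the key 'mask'
    mask = np.load(f'datasets/ground_maps/{img_id}.npz')['mask']
    ground_pixels = torch.count_nonzero(torch.as_tensor(mask)).item()
    print(mask.shape, ground_pixels)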
cubercnn/evaluation/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .omni3d_evaluation import *
cubercnn/evaluation/omni3d_evaluation.py ADDED
@@ -0,0 +1,1706 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import contextlib
3
+ import copy
4
+ import datetime
5
+ import io
6
+ import itertools
7
+ import json
8
+ import logging
9
+ import os
10
+ import time
11
+ from collections import defaultdict
12
+ from typing import List, Union
13
+ from typing import Tuple
14
+
15
+ import numpy as np
16
+ import pycocotools.mask as maskUtils
17
+ import torch
18
+ from detectron2.utils.memory import retry_if_cuda_oom
19
+ from detectron2.data import MetadataCatalog, DatasetCatalog
20
+ from detectron2.evaluation.coco_evaluation import COCOEvaluator
21
+ from detectron2.structures import BoxMode
22
+ from detectron2.utils.file_io import PathManager
23
+ from detectron2.utils.logger import create_small_table, log_every_n_seconds
24
+ from pycocotools.cocoeval import COCOeval
25
+ from tabulate import tabulate
26
+ from detectron2.utils.comm import get_world_size, is_main_process
27
+ import detectron2.utils.comm as comm
28
+ from detectron2.evaluation import (
29
+ DatasetEvaluators, inference_context, DatasetEvaluator
30
+ )
31
+ from collections import OrderedDict, abc
32
+ from contextlib import ExitStack, contextmanager
33
+ from torch import nn
34
+
35
+ import logging
36
+ from cubercnn.data import Omni3D
37
+ from pytorch3d import _C
38
+ import torch.nn.functional as F
39
+
40
+ from pytorch3d.ops.iou_box3d import _box_planes, _box_triangles
41
+
42
+ import cubercnn.vis.logperf as utils_logperf
43
+ from cubercnn.data import (
44
+ get_omni3d_categories,
45
+ simple_register
46
+ )
47
+
48
+ """
49
+ This file contains
50
+ * Omni3DEvaluationHelper: a helper object to accumulate and summarize evaluation results
51
+ * Omni3Deval: a wrapper around COCOeval to perform 2D and 3D bounding box evaluation in the detection setting
52
+ * Omni3DEvaluator: a wrapper around COCOEvaluator to collect results on each dataset
53
+ * Omni3DParams: parameters for the evaluation API
54
+ """
55
+
56
+ logger = logging.getLogger(__name__)
57
+
58
+ # Defines the maximum cross product len(dts) * len(gts)
60
+ # which we will attempt to compute on a GPU.
61
+ # The fallback is safer computation on a CPU.
62
+ # A value of 0 disables computation on the GPU entirely.
62
+ MAX_DTS_CROSS_GTS_FOR_IOU3D = 0
63
+
64
+
65
+ def _check_coplanar(boxes: torch.Tensor, eps: float = 1e-4) -> torch.BoolTensor:
66
+ """
67
+ Checks that plane vertices are coplanar.
68
+ Returns a bool tensor of size B, where True indicates a box is coplanar.
69
+ """
70
+ faces = torch.tensor(_box_planes, dtype=torch.int64, device=boxes.device)
71
+ verts = boxes.index_select(index=faces.view(-1), dim=1)
72
+ B = boxes.shape[0]
73
+ P, V = faces.shape
74
+ # (B, P, 4, 3) -> (B, P, 3)
75
+ v0, v1, v2, v3 = verts.reshape(B, P, V, 3).unbind(2)
76
+
77
+ # Compute the normal
78
+ e0 = F.normalize(v1 - v0, dim=-1)
79
+ e1 = F.normalize(v2 - v0, dim=-1)
80
+ normal = F.normalize(torch.cross(e0, e1, dim=-1), dim=-1)
81
+
82
+ # Check the fourth vertex is also on the same plane
83
+ mat1 = (v3 - v0).view(B, 1, -1) # (B, 1, P*3)
84
+ mat2 = normal.view(B, -1, 1) # (B, P*3, 1)
85
+
86
+ return (mat1.bmm(mat2).abs() < eps).view(B)
87
+
88
+
89
+ def _check_nonzero(boxes: torch.Tensor, eps: float = 1e-8) -> torch.BoolTensor:
90
+ """
91
+ Checks that the sides of the box have a non-zero area.
92
+ Returns a bool tensor of size B, where True indicates a box is nonzero.
93
+ """
94
+ faces = torch.tensor(_box_triangles, dtype=torch.int64, device=boxes.device)
95
+ verts = boxes.index_select(index=faces.view(-1), dim=1)
96
+ B = boxes.shape[0]
97
+ T, V = faces.shape
98
+ # (B, T, 3, 3) -> (B, T, 3)
99
+ v0, v1, v2 = verts.reshape(B, T, V, 3).unbind(2)
100
+
101
+ normals = torch.cross(v1 - v0, v2 - v0, dim=-1) # (B, T, 3)
102
+ face_areas = normals.norm(dim=-1) / 2
103
+
104
+ return (face_areas > eps).all(1).view(B)
105
+
106
+ def box3d_overlap(
107
+ boxes_dt: torch.Tensor, boxes_gt: torch.Tensor,
108
+ eps_coplanar: float = 1e-4, eps_nonzero: float = 1e-8
109
+ ) -> torch.Tensor:
110
+ """
111
+ Computes the intersection of 3D boxes_dt and boxes_gt.
112
+
113
+ Inputs boxes_dt, boxes_gt are tensors of shape (B, 8, 3)
114
+ (where B doesn't have to be the same for boxes_dt and boxes_gt),
115
+ containing the 8 corners of the boxes, as follows:
116
+
117
+ (4) +---------+. (5)
118
+ | ` . | ` .
119
+ | (0) +---+-----+ (1)
120
+ | | | |
121
+ (7) +-----+---+. (6)|
122
+ ` . | ` . |
123
+ (3) ` +---------+ (2)
124
+
125
+
126
+ NOTE: Throughout this implementation, we assume that boxes
127
+ are defined by their 8 corners exactly in the order specified in the
128
+ diagram above for the function to give correct results. In addition
129
+ the vertices on each plane must be coplanar.
130
+ As an alternative to the diagram, this is a unit bounding
131
+ box which has the correct vertex ordering:
132
+
133
+ box_corner_vertices = [
134
+ [0, 0, 0],
135
+ [1, 0, 0],
136
+ [1, 1, 0],
137
+ [0, 1, 0],
138
+ [0, 0, 1],
139
+ [1, 0, 1],
140
+ [1, 1, 1],
141
+ [0, 1, 1],
142
+ ]
143
+
144
+ Args:
145
+ boxes_dt: tensor of shape (N, 8, 3) of the coordinates of the 1st boxes
146
+ boxes_gt: tensor of shape (M, 8, 3) of the coordinates of the 2nd boxes
147
+ Returns:
148
+ iou: (N, M) tensor of the intersection over union which is
149
+ defined as: `iou = vol / (vol1 + vol2 - vol)`
150
+ """
151
+ # Make sure predictions are coplanar and nonzero
152
+ invalid_coplanar = ~_check_coplanar(boxes_dt, eps=eps_coplanar)
153
+ invalid_nonzero = ~_check_nonzero(boxes_dt, eps=eps_nonzero)
154
+
155
+ ious = _C.iou_box3d(boxes_dt, boxes_gt)[1]
156
+
157
+ # Offending boxes are set to zero IoU
158
+ if invalid_coplanar.any():
159
+ ious[invalid_coplanar] = 0
160
+ print('Warning: skipping {:d} non-coplanar boxes at eval.'.format(int(invalid_coplanar.float().sum())))
161
+
162
+ if invalid_nonzero.any():
163
+ ious[invalid_nonzero] = 0
164
+ print('Warning: skipping {:d} zero volume boxes at eval.'.format(int(invalid_nonzero.float().sum())))
165
+
166
+ return ious
167
+
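A minimal sketch (hypothetical usage, assuming pytorch3d is installed and this module is importable) of calling box3d_overlap with the corner ordering documented above; shifting a unit cube by half its width should give an IoU of roughly 1/3 (intersection 0.5, union 1.5):

    import torch

    # unit cube corners in the documented vertex ordering (first four at z=0, last four at z=1)
    corners = torch.tensor([
        [0., 0., 0.], [1., 0., 0.], [1., 1., 0.], [0., 1., 0.],
        [0., 0., 1.], [1., 0., 1.], [1., 1., 1.], [0., 1., 1.],
    ])
    boxes_dt = corners.unsqueeze(0)                                    # (1, 8, 3)
    boxes_gt = (corners + torch.tensor([0.5, 0.0, 0.0])).unsqueeze(0)  # shifted copy

    iou = box3d_overlap(boxes_dt, boxes_gt)  # (1, 1) tensor, approximately 0.3333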
168
+ class Omni3DEvaluationHelper:
169
+ def __init__(self,
170
+ dataset_names,
171
+ filter_settings,
172
+ output_folder,
173
+ iter_label='-',
174
+ only_2d=False,
175
+ ):
176
+ """
177
+ A helper class to initialize, evaluate and summarize Omni3D metrics.
178
+
179
+ The evaluator relies on the detectron2 MetadataCatalog for keeping track
180
+ of category names and contiguous IDs. Hence, it is important to set
181
+ these variables appropriately.
182
+
183
+ # (list[str]) the category names in their contiguous order
184
+ MetadataCatalog.get('omni3d_model').thing_classes = ...
185
+
186
+ # (dict[int: int]) the mapping from Omni3D category IDs to the contiguous order
187
+ MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
188
+
189
+ Args:
190
+ dataset_names (list[str]): the individual dataset splits for evaluation
191
+ filter_settings (dict): the filter settings used for evaluation, see
192
+ cubercnn/data/datasets.py get_filter_settings_from_cfg
193
+ output_folder (str): the output folder where results can be stored to disk.
194
+ iter_label (str): an optional iteration/label used within the summary
195
+ only_2d (bool): whether the evaluation mode should be 2D or 2D and 3D.
196
+ """
197
+
198
+ self.dataset_names = dataset_names
199
+ self.filter_settings = filter_settings
200
+ self.output_folder = output_folder
201
+ self.iter_label = iter_label
202
+ self.only_2d = only_2d
203
+
204
+ # Each dataset evaluator is stored here
205
+ self.evaluators = OrderedDict()
206
+
207
+ # These are the main evaluation results
208
+ self.results = OrderedDict()
209
+
210
+ # These store per-dataset results to be printed
211
+ self.results_analysis = OrderedDict()
212
+ self.results_omni3d = OrderedDict()
213
+
214
+ self.overall_imgIds = set()
215
+ self.overall_catIds = set()
216
+
217
+ # These store the evaluations for each category and area,
218
+ # concatenated from ALL evaluated datasets. Doing so avoids
219
+ # the need to re-compute them when accumulating results.
220
+ self.evals_per_cat_area2D = {}
221
+ self.evals_per_cat_area3D = {}
222
+
223
+ self.output_folders = {
224
+ dataset_name: os.path.join(self.output_folder, dataset_name)
225
+ for dataset_name in dataset_names
226
+ }
227
+
228
+ for dataset_name in self.dataset_names:
229
+
230
+ # register any datasets that need it
231
+ if MetadataCatalog.get(dataset_name).get('json_file') is None:
232
+ simple_register(dataset_name, filter_settings, filter_empty=False)
233
+
234
+ # create an individual dataset evaluator
235
+ self.evaluators[dataset_name] = Omni3DEvaluator(
236
+ dataset_name, output_dir=self.output_folders[dataset_name],
237
+ filter_settings=self.filter_settings, only_2d=self.only_2d,
238
+ eval_prox=('Objectron' in dataset_name or 'SUNRGBD' in dataset_name),
239
+ distributed=False, # actual evaluation should be single process
240
+ )
241
+
242
+ self.evaluators[dataset_name].reset()
243
+ self.overall_imgIds.update(set(self.evaluators[dataset_name]._omni_api.getImgIds()))
244
+ self.overall_catIds.update(set(self.evaluators[dataset_name]._omni_api.getCatIds()))
245
+
246
+ def add_predictions(self, dataset_name, predictions):
247
+ """
248
+ Adds predictions to the evaluator for dataset_name. This can be any number of
249
+ predictions, including all predictions passed in at once or in batches.
250
+
251
+ Args:
252
+ dataset_name (str): the dataset split name which the predictions belong to
253
+ predictions (list[dict]): each item in the list is a dict as follows:
254
+
255
+ {
256
+ "image_id": <int> the unique image identifier from Omni3D,
257
+ "K": <np.array> 3x3 intrinsics matrix for the image,
258
+ "width": <int> image width,
259
+ "height": <int> image height,
260
+ "instances": [
261
+ {
262
+ "image_id": <int> the unique image identifier from Omni3D,
263
+ "category_id": <int> the contiguous category prediction IDs,
264
+ which can be mapped from Omni3D's category ID's using
265
+ MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
266
+ "bbox": [float] 2D box as [x1, y1, x2, y2] used for IoU2D,
267
+ "score": <float> the confidence score for the object,
268
+ "depth": <float> the depth of the center of the object,
269
+ "bbox3D": list[list[float]] 8x3 corner vertices used for IoU3D,
270
+ }
271
+ ...
272
+ ]
273
+ }
274
+ """
275
+ # concatenate incoming predictions
276
+ self.evaluators[dataset_name]._predictions += predictions
277
+
278
+ def save_predictions(self, dataset_name):
279
+ """
280
+ Saves the predictions from dataset_name to disk, under the corresponding subfolder of self.output_folder.
281
+
282
+ Args:
283
+ dataset_name (str): the dataset split name which should be saved.
284
+ """
285
+ # save predictions to disk
286
+ output_folder_dataset = self.output_folders[dataset_name]
287
+ PathManager.mkdirs(output_folder_dataset)
288
+ file_path = os.path.join(output_folder_dataset, "instances_predictions.pth")
289
+ with PathManager.open(file_path, "wb") as f:
290
+ torch.save(self.evaluators[dataset_name]._predictions, f)
291
+
292
+ def evaluate(self, dataset_name):
293
+ """
294
+ Runs the evaluation for an individual dataset split, assuming all
295
+ predictions have been passed in.
296
+
297
+ Args:
298
+ dataset_name (str): the dataset split name which should be evaluated.
299
+ """
300
+
301
+ if not dataset_name in self.results:
302
+
303
+ # run evaluation and cache
304
+ self.results[dataset_name] = self.evaluators[dataset_name].evaluate()
305
+
306
+ results = self.results[dataset_name]
307
+
308
+ logger.info('\n'+results['log_str_2D'].replace('mode=2D', '{} iter={} mode=2D'.format(dataset_name, self.iter_label)))
309
+
310
+ # store the partially accumulated evaluations per category per area
311
+ for key, item in results['bbox_2D_evals_per_cat_area'].items():
312
+ if not key in self.evals_per_cat_area2D:
313
+ self.evals_per_cat_area2D[key] = []
314
+ self.evals_per_cat_area2D[key] += item
315
+
316
+ if not self.only_2d:
317
+ # store the partially accumulated evaluations per category per area
318
+ for key, item in results['bbox_3D_evals_per_cat_area'].items():
319
+ if not key in self.evals_per_cat_area3D:
320
+ self.evals_per_cat_area3D[key] = []
321
+ self.evals_per_cat_area3D[key] += item
322
+
323
+ logger.info('\n'+results['log_str_3D'].replace('mode=3D', '{} iter={} mode=3D'.format(dataset_name, self.iter_label)))
324
+
325
+ # full model category names
326
+ category_names = self.filter_settings['category_names']
327
+
328
+ # The set of categories present in the dataset; there should be no duplicates
329
+ categories = {cat for cat in category_names if 'AP-{}'.format(cat) in results['bbox_2D']}
330
+ assert len(categories) == len(set(categories))
331
+
332
+ # default are all NaN
333
+ general_2D, general_3D, omni_2D, omni_3D = (np.nan,) * 4
334
+
335
+ # 2D and 3D performance for categories in dataset; and log
336
+ general_2D = np.mean([results['bbox_2D']['AP-{}'.format(cat)] for cat in categories])
337
+ if not self.only_2d:
338
+ general_3D = np.mean([results['bbox_3D']['AP-{}'.format(cat)] for cat in categories])
339
+
340
+ # 2D and 3D performance on Omni3D categories
341
+ omni3d_dataset_categories = get_omni3d_categories(dataset_name) # dataset-specific categories
342
+ if len(omni3d_dataset_categories - categories) == 0: # omni3d_dataset_categories is a subset of categories
343
+ omni_2D = np.mean([results['bbox_2D']['AP-{}'.format(cat)] for cat in omni3d_dataset_categories])
344
+ if not self.only_2d:
345
+ omni_3D = np.mean([results['bbox_3D']['AP-{}'.format(cat)] for cat in omni3d_dataset_categories])
346
+
347
+ self.results_omni3d[dataset_name] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
348
+
349
+ # Performance analysis
350
+ extras_AP15, extras_AP25, extras_AP50, extras_APn, extras_APm, extras_APf = (np.nan,)*6
351
+ if not self.only_2d:
352
+ extras_AP15 = results['bbox_3D']['AP15']
353
+ extras_AP25 = results['bbox_3D']['AP25']
354
+ extras_AP50 = results['bbox_3D']['AP50']
355
+ extras_APn = results['bbox_3D']['APn']
356
+ extras_APm = results['bbox_3D']['APm']
357
+ extras_APf = results['bbox_3D']['APf']
358
+
359
+ self.results_analysis[dataset_name] = {
360
+ "iters": self.iter_label,
361
+ "AP2D": general_2D, "AP3D": general_3D,
362
+ "AP3D@15": extras_AP15, "AP3D@25": extras_AP25, "AP3D@50": extras_AP50,
363
+ "AP3D-N": extras_APn, "AP3D-M": extras_APm, "AP3D-F": extras_APf
364
+ }
365
+
366
+ # Performance per category
367
+ results_cat = OrderedDict()
368
+ for cat in category_names:
369
+ cat_2D, cat_3D = (np.nan,) * 2
370
+ if 'AP-{}'.format(cat) in results['bbox_2D']:
371
+ cat_2D = results['bbox_2D']['AP-{}'.format(cat)]
372
+ if not self.only_2d:
373
+ cat_3D = results['bbox_3D']['AP-{}'.format(cat)]
374
+ if not np.isnan(cat_2D) or not np.isnan(cat_3D):
375
+ results_cat[cat] = {"AP2D": cat_2D, "AP3D": cat_3D}
376
+ utils_logperf.print_ap_category_histogram(dataset_name, results_cat)
377
+
378
+ def summarize_all(self,):
379
+ '''
380
+ Report collective metrics when possible for the Omni3D dataset.
381
+ This uses pre-computed evaluation results from each dataset,
382
+ which were aggregated and cached while evaluating individually.
383
+ This process simply re-accumulates and summarizes them.
384
+ '''
385
+
386
+ # First, double check that we have all the evaluations
387
+ for dataset_name in self.dataset_names:
388
+ if not dataset_name in self.results:
389
+ self.evaluate(dataset_name)
390
+
391
+ thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
392
+ catId2contiguous = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id
393
+ ordered_things = [thing_classes[catId2contiguous[cid]] for cid in self.overall_catIds]
394
+ categories = set(ordered_things)
395
+
396
+ evaluator2D = Omni3Deval(mode='2D')
397
+ evaluator2D.params.catIds = list(self.overall_catIds)
398
+ evaluator2D.params.imgIds = list(self.overall_imgIds)
399
+ evaluator2D.evalImgs = True
400
+ evaluator2D.evals_per_cat_area = self.evals_per_cat_area2D
401
+ evaluator2D._paramsEval = copy.deepcopy(evaluator2D.params)
402
+ evaluator2D.accumulate()
403
+ summarize_str2D = evaluator2D.summarize()
404
+
405
+ precisions = evaluator2D.eval['precision']
406
+
407
+ metrics = ["AP", "AP50", "AP75", "AP95", "APs", "APm", "APl"]
408
+
409
+ results2D = {
410
+ metric: float(
411
+ evaluator2D.stats[idx] * 100 if evaluator2D.stats[idx] >= 0 else "nan"
412
+ )
413
+ for idx, metric in enumerate(metrics)
414
+ }
415
+
416
+ for idx, name in enumerate(ordered_things):
417
+ precision = precisions[:, :, idx, 0, -1]
418
+ precision = precision[precision > -1]
419
+ ap = np.mean(precision) if precision.size else float("nan")
420
+ results2D.update({"AP-" + "{}".format(name): float(ap * 100)})
421
+
422
+ if not self.only_2d:
423
+ evaluator3D = Omni3Deval(mode='3D')
424
+ evaluator3D.params.catIds = list(self.overall_catIds)
425
+ evaluator3D.params.imgIds = list(self.overall_imgIds)
426
+ evaluator3D.evalImgs = True
427
+ evaluator3D.evals_per_cat_area = self.evals_per_cat_area3D
428
+ evaluator3D._paramsEval = copy.deepcopy(evaluator3D.params)
429
+ evaluator3D.accumulate()
430
+ summarize_str3D = evaluator3D.summarize()
431
+
432
+ precisions = evaluator3D.eval['precision']
433
+
434
+ metrics = ["AP", "AP15", "AP25", "AP50", "APn", "APm", "APf"]
435
+
436
+ results3D = {
437
+ metric: float(
438
+ evaluator3D.stats[idx] * 100 if evaluator3D.stats[idx] >= 0 else "nan"
439
+ )
440
+ for idx, metric in enumerate(metrics)
441
+ }
442
+
443
+ for idx, name in enumerate(ordered_things):
444
+ precision = precisions[:, :, idx, 0, -1]
445
+ precision = precision[precision > -1]
446
+ ap = np.mean(precision) if precision.size else float("nan")
447
+ results3D.update({"AP-" + "{}".format(name): float(ap * 100)})
448
+
449
+
450
+ # All concat categories
451
+ general_2D, general_3D = (np.nan,) * 2
452
+
453
+ general_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in categories])
454
+ if not self.only_2d:
455
+ general_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in categories])
456
+
457
+ # Analysis performance
458
+ extras_AP15, extras_AP25, extras_AP50, extras_APn, extras_APm, extras_APf = (np.nan,) * 6
459
+ if not self.only_2d:
460
+ extras_AP15 = results3D['AP15']
461
+ extras_AP25 = results3D['AP25']
462
+ extras_AP50 = results3D['AP50']
463
+ extras_APn = results3D['APn']
464
+ extras_APm = results3D['APm']
465
+ extras_APf = results3D['APf']
466
+
467
+ self.results_analysis["<Concat>"] = {
468
+ "iters": self.iter_label,
469
+ "AP2D": general_2D, "AP3D": general_3D,
470
+ "AP3D@15": extras_AP15, "AP3D@25": extras_AP25, "AP3D@50": extras_AP50,
471
+ "AP3D-N": extras_APn, "AP3D-M": extras_APm, "AP3D-F": extras_APf
472
+ }
473
+
474
+ # Omni3D Outdoor performance
475
+ omni_2D, omni_3D = (np.nan,) * 2
476
+
477
+ omni3d_outdoor_categories = get_omni3d_categories("omni3d_out")
478
+ if len(omni3d_outdoor_categories - categories) == 0:
479
+ omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_outdoor_categories])
480
+ if not self.only_2d:
481
+ omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_outdoor_categories])
482
+
483
+ self.results_omni3d["Omni3D_Out"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
484
+
485
+ # Omni3D Indoor performance
486
+ omni_2D, omni_3D = (np.nan,) * 2
487
+
488
+ omni3d_indoor_categories = get_omni3d_categories("omni3d_in")
489
+ if len(omni3d_indoor_categories - categories) == 0:
490
+ omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_indoor_categories])
491
+ if not self.only_2d:
492
+ omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_indoor_categories])
493
+
494
+ self.results_omni3d["Omni3D_In"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
495
+
496
+ # Omni3D performance
497
+ omni_2D, omni_3D = (np.nan,) * 2
498
+
499
+ omni3d_categories = get_omni3d_categories("omni3d")
500
+ if len(omni3d_categories - categories) == 0:
501
+ omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_categories])
502
+ if not self.only_2d:
503
+ omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_categories])
504
+
505
+ self.results_omni3d["Omni3D"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D}
506
+
507
+ # Per-category performance for the cumulative datasets
508
+ results_cat = OrderedDict()
509
+ for cat in self.filter_settings['category_names']:
510
+ cat_2D, cat_3D = (np.nan,) * 2
511
+ if 'AP-{}'.format(cat) in results2D:
512
+ cat_2D = results2D['AP-{}'.format(cat)]
513
+ if not self.only_2d:
514
+ cat_3D = results3D['AP-{}'.format(cat)]
515
+ if not np.isnan(cat_2D) or not np.isnan(cat_3D):
516
+ results_cat[cat] = {"AP2D": cat_2D, "AP3D": cat_3D}
517
+
518
+ utils_logperf.print_ap_category_histogram("<Concat>", results_cat)
519
+ utils_logperf.print_ap_analysis_histogram(self.results_analysis)
520
+ utils_logperf.print_ap_omni_histogram(self.results_omni3d)
521
+
522
+
523
+ def inference_on_dataset(model, data_loader):
524
+ """
525
+ Run model on the data_loader.
526
+ Also benchmark the inference speed of `model.__call__` accurately.
527
+ The model will be used in eval mode.
528
+
529
+ Args:
530
+ model (callable): a callable which takes an object from
531
+ `data_loader` and returns some outputs.
532
+
533
+ If it's an nn.Module, it will be temporarily set to `eval` mode.
534
+ If you wish to evaluate a model in `training` mode instead, you can
535
+ wrap the given model and override its behavior of `.eval()` and `.train()`.
536
+ data_loader: an iterable object with a length.
537
+ The elements it generates will be the inputs to the model.
538
+
539
+ Returns:
540
+ inference_json: a list of per-image prediction dicts in COCO-style format, gathered on the main process (non-main processes return an empty list when distributed).
541
+ """
542
+
543
+ num_devices = get_world_size()
544
+ distributed = num_devices > 1
545
+ logger.info("Start inference on {} batches".format(len(data_loader)))
546
+
547
+ total = len(data_loader) # inference data loader must have a fixed length
548
+
549
+ num_warmup = min(5, total - 1)
550
+ start_time = time.perf_counter()
551
+ total_data_time = 0
552
+ total_compute_time = 0
553
+ total_eval_time = 0
554
+
555
+ inference_json = []
556
+
557
+ with ExitStack() as stack:
558
+ if isinstance(model, nn.Module):
559
+ stack.enter_context(inference_context(model))
560
+ stack.enter_context(torch.no_grad())
561
+
562
+ start_data_time = time.perf_counter()
563
+ for idx, inputs in enumerate(data_loader):
564
+ total_data_time += time.perf_counter() - start_data_time
565
+ if idx == num_warmup:
566
+ start_time = time.perf_counter()
567
+ total_data_time = 0
568
+ total_compute_time = 0
569
+ total_eval_time = 0
570
+
571
+ start_compute_time = time.perf_counter()
572
+ outputs = model(inputs)
573
+ if torch.cuda.is_available():
574
+ torch.cuda.synchronize()
575
+ total_compute_time += time.perf_counter() - start_compute_time
576
+
577
+ start_eval_time = time.perf_counter()
578
+
579
+ for input, output in zip(inputs, outputs):
580
+
581
+ prediction = {
582
+ "image_id": input["image_id"],
583
+ "K": input["K"],
584
+ "width": input["width"],
585
+ "height": input["height"],
586
+ }
587
+
588
+ # convert to json format
589
+ instances = output["instances"].to('cpu')
590
+ prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
591
+
592
+ # store in overall predictions
593
+ inference_json.append(prediction)
594
+
595
+ total_eval_time += time.perf_counter() - start_eval_time
596
+
597
+ iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
598
+ data_seconds_per_iter = total_data_time / iters_after_start
599
+ compute_seconds_per_iter = total_compute_time / iters_after_start
600
+ eval_seconds_per_iter = total_eval_time / iters_after_start
601
+ total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
602
+ if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
603
+ eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
604
+ log_every_n_seconds(
605
+ logging.INFO,
606
+ (
607
+ f"Inference done {idx + 1}/{total}. "
608
+ f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
609
+ f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
610
+ f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
611
+ f"Total: {total_seconds_per_iter:.4f} s/iter. "
612
+ f"ETA={eta}"
613
+ ),
614
+ n=5,
615
+ )
616
+ start_data_time = time.perf_counter()
617
+
618
+ # Measure the time only for this worker (before the synchronization barrier)
619
+ total_time = time.perf_counter() - start_time
620
+ total_time_str = str(datetime.timedelta(seconds=total_time))
621
+ # NOTE this format is parsed by grep
622
+ logger.info(
623
+ "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
624
+ total_time_str, total_time / (total - num_warmup), num_devices
625
+ )
626
+ )
627
+ total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
628
+ logger.info(
629
+ "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
630
+ total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
631
+ )
632
+ )
633
+
634
+ if distributed:
635
+ comm.synchronize()
636
+ inference_json = comm.gather(inference_json, dst=0)
637
+ inference_json = list(itertools.chain(*inference_json))
638
+
639
+ if not comm.is_main_process():
640
+ return []
641
+
642
+ return inference_json
643
+
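A minimal sketch (hypothetical wiring, with data-loader construction elided) of how inference_on_dataset and Omni3DEvaluationHelper defined in this file are intended to fit together:

    def run_omni3d_eval(model, data_loaders, dataset_names, filter_settings, output_folder):
        # data_loaders: dict mapping dataset_name -> its test data loader (built elsewhere)
        helper = Omni3DEvaluationHelper(dataset_names, filter_settings, output_folder, only_2d=False)
        for dataset_name in dataset_names:
            predictions = inference_on_dataset(model, data_loaders[dataset_name])
            helper.add_predictions(dataset_name, predictions)
            helper.save_predictions(dataset_name)
            helper.evaluate(dataset_name)
        # finally summarize metrics across all evaluated datasets
        helper.summarize_all()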
644
+ class Omni3DEvaluator(COCOEvaluator):
645
+ def __init__(
646
+ self,
647
+ dataset_name,
648
+ tasks=None,
649
+ distributed=True,
650
+ output_dir=None,
651
+ *,
652
+ max_dets_per_image=None,
653
+ use_fast_impl=False,
654
+ eval_prox=False,
655
+ only_2d=False,
656
+ filter_settings={},
657
+ ):
658
+ """
659
+ Args:
660
+ dataset_name (str): name of the dataset to be evaluated.
661
+ It must have either the following corresponding metadata:
662
+ "json_file": the path to the COCO format annotation
663
+ Or it must be in detectron2's standard dataset format
664
+ so it can be converted to COCO format automatically.
665
+ tasks (tuple[str]): tasks that can be evaluated under the given
666
+ configuration. For now, support only for "bbox".
667
+ distributed (True): if True, will collect results from all ranks and run evaluation
668
+ in the main process.
669
+ Otherwise, will only evaluate the results in the current process.
670
+ output_dir (str): optional, an output directory to dump all
671
+ results predicted on the dataset. The dump contains two files:
672
+ 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
673
+ contains all the results in the format they are produced by the model.
674
+ 2. "coco_instances_results.json" a json file in COCO's result format.
675
+ max_dets_per_image (int): limit on the maximum number of detections per image.
676
+ By default in COCO, this limit is to 100, but this can be customized
677
+ to be greater, as is needed in evaluation metrics AP fixed and AP pool
678
+ (see https://arxiv.org/pdf/2102.01066.pdf)
679
+ This doesn't affect keypoint evaluation.
680
+ use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
681
+ Although the results should be very close to the official implementation in COCO
682
+ API, it is still recommended to compute results with the official API for use in
683
+ papers. The faster implementation also uses more RAM.
684
+ eval_prox (bool): whether to perform proximity evaluation. For datasets that are not
685
+ exhaustively annotated.
686
+ only_2d (bool): evaluates only 2D performance if set to True
687
+ filter_settings (dict): settings for the dataset loader (TBD)
688
+ """
689
+
690
+ self._logger = logging.getLogger(__name__)
691
+ self._distributed = distributed
692
+ self._output_dir = output_dir
693
+ self._use_fast_impl = use_fast_impl
694
+ self._eval_prox = eval_prox
695
+ self._only_2d = only_2d
696
+ self._filter_settings = filter_settings
697
+
698
+ # COCOeval requires the limit on the number of detections per image (maxDets) to be a list
699
+ # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the
700
+ # 3rd element (100) is used as the limit on the number of detections per image when
701
+ # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval,
702
+ # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults.
703
+ if max_dets_per_image is None:
704
+ max_dets_per_image = [1, 10, 100]
705
+
706
+ else:
707
+ max_dets_per_image = [1, 10, max_dets_per_image]
708
+
709
+ self._max_dets_per_image = max_dets_per_image
710
+
711
+ self._tasks = tasks
712
+ self._cpu_device = torch.device("cpu")
713
+
714
+ self._metadata = MetadataCatalog.get(dataset_name)
715
+
716
+ json_file = PathManager.get_local_path(self._metadata.json_file)
717
+ with contextlib.redirect_stdout(io.StringIO()):
718
+ self._omni_api = Omni3D([json_file], filter_settings)
719
+
720
+ # Test set json files do not contain annotations (evaluation must be
721
+ # performed using the COCO evaluation server).
722
+ self._do_evaluation = "annotations" in self._omni_api.dataset
723
+
724
+ def process(self, inputs, outputs):
725
+ """
726
+ Args:
727
+ inputs: the inputs to a model (e.g., GeneralizedRCNN).
728
+ It is a list of dict. Each dict corresponds to an image and
729
+ contains keys like "height", "width", "file_name", "image_id".
730
+ outputs: the outputs of a model. It is a list of dicts with key
731
+ "instances" that contains :class:`Instances`.
732
+ """
733
+
734
+ # Optional image keys to keep when available
735
+ img_keys_optional = ["p2"]
736
+
737
+ for input, output in zip(inputs, outputs):
738
+
739
+ prediction = {
740
+ "image_id": input["image_id"],
741
+ "K": input["K"],
742
+ "width": input["width"],
743
+ "height": input["height"],
744
+ }
745
+
746
+ # store optional keys when available
747
+ for img_key in img_keys_optional:
748
+ if img_key in input:
749
+ prediction.update({img_key: input[img_key]})
750
+
751
+ # already in COCO format
752
+ if type(output["instances"]) == list:
753
+ prediction["instances"] = output["instances"]
754
+
755
+ # tensor instances format
756
+ else:
757
+ instances = output["instances"].to(self._cpu_device)
758
+ prediction["instances"] = instances_to_coco_json(
759
+ instances, input["image_id"]
760
+ )
761
+
762
+ if len(prediction) > 1:
763
+ self._predictions.append(prediction)
764
+
765
+ def _derive_omni_results(self, omni_eval, iou_type, mode, class_names=None):
766
+ """
767
+ Derive the desired score numbers from summarized COCOeval.
768
+ Args:
769
+ omni_eval (None or Omni3Deval): None represents no predictions from model.
770
+ iou_type (str):
771
+ mode (str): either "2D" or "3D"
772
+ class_names (None or list[str]): if provided, will use it to predict
773
+ per-category AP.
774
+ Returns:
775
+ a dict of {metric name: score}
776
+ """
777
+ assert mode in ["2D", "3D"]
778
+
779
+ metrics = {
780
+ "2D": ["AP", "AP50", "AP75", "AP95", "APs", "APm", "APl"],
781
+ "3D": ["AP", "AP15", "AP25", "AP50", "APn", "APm", "APf"],
782
+ }[mode]
783
+
784
+ if iou_type != "bbox":
785
+ raise ValueError("Support only for bbox evaluation.")
786
+
787
+ if omni_eval is None:
788
+ self._logger.warn("No predictions from the model!")
789
+ return {metric: float("nan") for metric in metrics}
790
+
791
+ # the standard metrics
792
+ results = {
793
+ metric: float(
794
+ omni_eval.stats[idx] * 100 if omni_eval.stats[idx] >= 0 else "nan"
795
+ )
796
+ for idx, metric in enumerate(metrics)
797
+ }
798
+ self._logger.info(
799
+ "Evaluation results for {} in {} mode: \n".format(iou_type, mode)
800
+ + create_small_table(results)
801
+ )
802
+ if not np.isfinite(sum(results.values())):
803
+ self._logger.info("Some metrics cannot be computed and are shown as NaN.")
804
+
805
+ if class_names is None or len(class_names) <= 1:
806
+ return results
807
+
808
+ # Compute per-category AP
809
+ # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
810
+ precisions = omni_eval.eval["precision"]
811
+
812
+ # precision has dims (iou, recall, cls, area range, max dets)
813
+ assert len(class_names) == precisions.shape[2]
814
+
815
+ results_per_category = []
816
+ for idx, name in enumerate(class_names):
817
+ # area range index 0: all area ranges
818
+ # max dets index -1: typically 100 per image
819
+ precision = precisions[:, :, idx, 0, -1]
820
+ precision = precision[precision > -1]
821
+ ap = np.mean(precision) if precision.size else float("nan")
822
+ results_per_category.append(("{}".format(name), float(ap * 100)))
823
+
824
+ # tabulate it
825
+ N_COLS = min(6, len(results_per_category) * 2)
826
+ results_flatten = list(itertools.chain(*results_per_category))
827
+ results_table = itertools.zip_longest(
828
+ *[results_flatten[i::N_COLS] for i in range(N_COLS)]
829
+ )
830
+ table = tabulate(
831
+ results_table,
832
+ tablefmt="pipe",
833
+ floatfmt=".3f",
834
+ headers=["category", "AP"] * (N_COLS // 2),
835
+ numalign="left",
836
+ )
837
+ self._logger.info(
838
+ "Per-category {} AP in {} mode: \n".format(iou_type, mode) + table
839
+ )
840
+ results.update({"AP-" + name: ap for name, ap in results_per_category})
841
+ return results
842
+
843
+ def _eval_predictions(self, predictions, img_ids=None):
844
+ """
845
+ Evaluate predictions. Fill self._results with the metrics of the tasks.
846
+ """
847
+ self._logger.info("Preparing results for COCO format ...")
848
+ omni_results = list(itertools.chain(*[x["instances"] for x in predictions]))
849
+ tasks = self._tasks or self._tasks_from_predictions(omni_results)
850
+
851
+ omni3d_global_categories = MetadataCatalog.get('omni3d_model').thing_classes
852
+
853
+ # the dataset results will store only the categories that are present
854
+ # in the corresponding dataset, all others will be dropped.
855
+ dataset_results = []
856
+
857
+ # unmap the category ids for COCO
858
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
859
+ dataset_id_to_contiguous_id = (
860
+ self._metadata.thing_dataset_id_to_contiguous_id
861
+ )
862
+ all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
863
+ num_classes = len(all_contiguous_ids)
864
+ assert (
865
+ min(all_contiguous_ids) == 0
866
+ and max(all_contiguous_ids) == num_classes - 1
867
+ )
868
+
869
+ reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
870
+ for result in omni_results:
871
+ category_id = result["category_id"]
872
+ assert category_id < num_classes, (
873
+ f"A prediction has class={category_id}, "
874
+ f"but the dataset only has {num_classes} classes and "
875
+ f"predicted class id should be in [0, {num_classes - 1}]."
876
+ )
877
+ result["category_id"] = reverse_id_mapping[category_id]
878
+
879
+ cat_name = omni3d_global_categories[category_id]
880
+
881
+ if cat_name in self._metadata.thing_classes:
882
+ dataset_results.append(result)
883
+
884
+ # replace the results with the filtered
885
+ # instances that are in vocabulary.
886
+ omni_results = dataset_results
887
+
888
+ if self._output_dir:
889
+ file_path = os.path.join(self._output_dir, "omni_instances_results.json")
890
+ self._logger.info("Saving results to {}".format(file_path))
891
+ with PathManager.open(file_path, "w") as f:
892
+ f.write(json.dumps(omni_results))
893
+ f.flush()
894
+
895
+ if not self._do_evaluation:
896
+ self._logger.info("Annotations are not available for evaluation.")
897
+ return
898
+
899
+ self._logger.info(
900
+ "Evaluating predictions with {} COCO API...".format(
901
+ "unofficial" if self._use_fast_impl else "official"
902
+ )
903
+ )
904
+ for task in sorted(tasks):
905
+ assert task in {"bbox"}, f"Got unknown task: {task}!"
906
+ evals, log_strs = (
907
+ _evaluate_predictions_on_omni(
908
+ self._omni_api,
909
+ omni_results,
910
+ task,
911
+ img_ids=img_ids,
912
+ only_2d=self._only_2d,
913
+ eval_prox=self._eval_prox,
914
+ )
915
+ if len(omni_results) > 0
916
+ else None # cocoapi does not handle empty results very well
917
+ )
918
+
919
+ modes = evals.keys()
920
+ for mode in modes:
921
+ res = self._derive_omni_results(
922
+ evals[mode],
923
+ task,
924
+ mode,
925
+ class_names=self._metadata.get("thing_classes"),
926
+ )
927
+ self._results[task + "_" + format(mode)] = res
928
+ self._results[task + "_" + format(mode) + '_evalImgs'] = evals[mode].evalImgs
929
+ self._results[task + "_" + format(mode) + '_evals_per_cat_area'] = evals[mode].evals_per_cat_area
930
+
931
+ self._results["log_str_2D"] = log_strs["2D"]
932
+
933
+ if "3D" in log_strs:
934
+ self._results["log_str_3D"] = log_strs["3D"]
935
+
936
+
937
+ def _evaluate_predictions_on_omni(
938
+ omni_gt,
939
+ omni_results,
940
+ iou_type,
941
+ img_ids=None,
942
+ only_2d=False,
943
+ eval_prox=False,
944
+ ):
945
+ """
946
+ Evaluate the coco results using COCOEval API.
947
+ """
948
+ assert len(omni_results) > 0
949
+ log_strs, evals = {}, {}
950
+
951
+ omni_dt = omni_gt.loadRes(omni_results)
952
+
953
+ modes = ["2D"] if only_2d else ["2D", "3D"]
954
+
955
+ for mode in modes:
956
+ omni_eval = Omni3Deval(
957
+ omni_gt, omni_dt, iouType=iou_type, mode=mode, eval_prox=eval_prox
958
+ )
959
+ if img_ids is not None:
960
+ omni_eval.params.imgIds = img_ids
961
+
962
+ omni_eval.evaluate()
963
+ omni_eval.accumulate()
964
+ log_str = omni_eval.summarize()
965
+ log_strs[mode] = log_str
966
+ evals[mode] = omni_eval
967
+
968
+ return evals, log_strs
969
+
970
+
971
+ def instances_to_coco_json(instances, img_id):
972
+
973
+ num_instances = len(instances)
974
+
975
+ if num_instances == 0:
976
+ return []
977
+
978
+ boxes = BoxMode.convert(
979
+ instances.pred_boxes.tensor.numpy(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
980
+ ).tolist()
981
+ scores = instances.scores.tolist()
982
+ classes = instances.pred_classes.tolist()
983
+
984
+ if hasattr(instances, "pred_bbox3D"):
985
+ bbox3D = instances.pred_bbox3D.tolist()
986
+ center_cam = instances.pred_center_cam.tolist()
987
+ center_2D = instances.pred_center_2D.tolist()
988
+ dimensions = instances.pred_dimensions.tolist()
989
+ pose = instances.pred_pose.tolist()
990
+ else:
991
+ # dummy
992
+ bbox3D = np.ones([num_instances, 8, 3]).tolist()
993
+ center_cam = np.ones([num_instances, 3]).tolist()
994
+ center_2D = np.ones([num_instances, 2]).tolist()
995
+ dimensions = np.ones([num_instances, 3]).tolist()
996
+ pose = np.ones([num_instances, 3, 3]).tolist()
997
+
998
+ results = []
999
+ for k in range(num_instances):
1000
+ result = {
1001
+ "image_id": img_id,
1002
+ "category_id": classes[k],
1003
+ "bbox": boxes[k],
1004
+ "score": scores[k],
1005
+ "depth": np.array(bbox3D[k])[:, 2].mean(),
1006
+ "bbox3D": bbox3D[k],
1007
+ "center_cam": center_cam[k],
1008
+ "center_2D": center_2D[k],
1009
+ "dimensions": dimensions[k],
1010
+ "pose": pose[k],
1011
+ }
1012
+
1013
+ results.append(result)
1014
+ return results
1015
+
1016
+
1017
+ # ---------------------------------------------------------------------
1018
+ # Omni3DParams
1019
+ # ---------------------------------------------------------------------
1020
+ class Omni3DParams:
1021
+ """
1022
+ Params for the Omni evaluation API
1023
+ """
1024
+
1025
+ def setDet2DParams(self):
1026
+ self.imgIds = []
1027
+ self.catIds = []
1028
+
1029
+ # np.arange causes trouble: the data points it produces can be slightly larger than the true value
1030
+ self.iouThrs = np.linspace(
1031
+ 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True
1032
+ )
1033
+
1034
+ self.recThrs = np.linspace(
1035
+ 0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True
1036
+ )
1037
+
1038
+ self.maxDets = [1, 10, 100]
1039
+ self.areaRng = [
1040
+ [0 ** 2, 1e5 ** 2],
1041
+ [0 ** 2, 32 ** 2],
1042
+ [32 ** 2, 96 ** 2],
1043
+ [96 ** 2, 1e5 ** 2],
1044
+ ]
1045
+
1046
+ self.areaRngLbl = ["all", "small", "medium", "large"]
1047
+ self.useCats = 1
1048
+
1049
+ def setDet3DParams(self):
1050
+ self.imgIds = []
1051
+ self.catIds = []
1052
+
1053
+ # np.arange causes trouble: the data points it produces can be slightly larger than the true value
1054
+ self.iouThrs = np.linspace(
1055
+ 0.05, 0.5, int(np.round((0.5 - 0.05) / 0.05)) + 1, endpoint=True
1056
+ )
1057
+
1058
+ self.recThrs = np.linspace(
1059
+ 0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True
1060
+ )
1061
+
1062
+ self.maxDets = [1, 10, 100]
1063
+ self.areaRng = [[0, 1e5], [0, 10], [10, 35], [35, 1e5]]
1064
+ self.areaRngLbl = ["all", "near", "medium", "far"]
1065
+ self.useCats = 1
1066
+
1067
+ def __init__(self, mode="2D"):
1068
+ """
1069
+ Args:
1070
+ mode (str): defines 2D or 3D evaluation parameters.
1071
+ One of {"2D", "3D"}
1072
+ """
1073
+
1074
+ if mode == "2D":
1075
+ self.setDet2DParams()
1076
+
1077
+ elif mode == "3D":
1078
+ self.setDet3DParams()
1079
+
1080
+ else:
1081
+ raise Exception("mode %s not supported" % (mode))
1082
+
1083
+ self.iouType = "bbox"
1084
+ self.mode = mode
1085
+ # the proximity threshold defines the neighborhood
1086
+ # when evaluating on non-exhaustively annotated datasets
1087
+ self.proximity_thresh = 0.3
1088
+
1089
+
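A minimal sketch (hypothetical usage) of the thresholds the two parameter modes produce, which is the main difference between 2D and 3D evaluation here:

    params_2d = Omni3DParams(mode="2D")
    print(params_2d.iouThrs)     # 0.50, 0.55, ..., 0.95 (10 IoU thresholds)

    params_3d = Omni3DParams(mode="3D")
    print(params_3d.iouThrs)     # 0.05, 0.10, ..., 0.50 (10 IoU thresholds)
    print(params_3d.areaRngLbl)  # ['all', 'near', 'medium', 'far'] (depth-based ranges)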
1090
+ # ---------------------------------------------------------------------
1091
+ # Omni3Deval
1092
+ # ---------------------------------------------------------------------
1093
+ class Omni3Deval(COCOeval):
1094
+ """
1095
+ Wraps COCOeval for 2D or 3D box evaluation depending on mode
1096
+ """
1097
+
1098
+ def __init__(
1099
+ self, cocoGt=None, cocoDt=None, iouType="bbox", mode="2D", eval_prox=False
1100
+ ):
1101
+ """
1102
+ Initialize COCOeval using coco APIs for Gt and Dt
1103
+ Args:
1104
+ cocoGt: COCO object with ground truth annotations
1105
+ cocoDt: COCO object with detection results
1106
+ iouType: (str) defines the evaluation type. Supports only "bbox" now.
1107
+ mode: (str) defines whether to evaluate 2D or 3D performance.
1108
+ One of {"2D", "3D"}
1109
+ eval_prox: (bool) if True, performs "Proximity Evaluation", i.e.
1110
+ evaluates detections in the proximity of the ground truth 2D boxes.
1111
+ This is used for datasets which are not exhaustively annotated.
1112
+ """
1113
+ if not iouType:
1114
+ print("iouType not specified. use default iouType bbox")
1115
+ elif iouType != "bbox":
1116
+ print("no support for %s iouType" % (iouType))
1117
+ self.mode = mode
1118
+ if mode not in ["2D", "3D"]:
1119
+ raise Exception("mode %s not supported" % (mode))
1120
+ self.eval_prox = eval_prox
1121
+ self.cocoGt = cocoGt # ground truth COCO API
1122
+ self.cocoDt = cocoDt # detections COCO API
1123
+
1124
+ # per-image per-category evaluation results [KxAxI] elements
1125
+ self.evalImgs = defaultdict(list)
1126
+
1127
+ self.eval = {} # accumulated evaluation results
1128
+ self._gts = defaultdict(list) # gt for evaluation
1129
+ self._dts = defaultdict(list) # dt for evaluation
1130
+ self.params = Omni3DParams(mode) # parameters
1131
+ self._paramsEval = {} # parameters for evaluation
1132
+ self.stats = [] # result summarization
1133
+ self.ious = {} # ious between all gts and dts
1134
+
1135
+ if cocoGt is not None:
1136
+ self.params.imgIds = sorted(cocoGt.getImgIds())
1137
+ self.params.catIds = sorted(cocoGt.getCatIds())
1138
+
1139
+ self.evals_per_cat_area = None
1140
+
1141
+ def _prepare(self):
1142
+ """
1143
+ Prepare ._gts and ._dts for evaluation based on params
1144
+ """
1145
+
1146
+ p = self.params
1147
+
1148
+ if p.useCats:
1149
+ gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
1150
+ dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
1151
+
1152
+ else:
1153
+ gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
1154
+ dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
1155
+
1156
+ # set ignore flag
1157
+ ignore_flag = "ignore2D" if self.mode == "2D" else "ignore3D"
1158
+ for gt in gts:
1159
+ gt[ignore_flag] = gt[ignore_flag] if ignore_flag in gt else 0
1160
+
1161
+ self._gts = defaultdict(list) # gt for evaluation
1162
+ self._dts = defaultdict(list) # dt for evaluation
1163
+
1164
+ for gt in gts:
1165
+ self._gts[gt["image_id"], gt["category_id"]].append(gt)
1166
+
1167
+ for dt in dts:
1168
+ self._dts[dt["image_id"], dt["category_id"]].append(dt)
1169
+
1170
+ self.evalImgs = defaultdict(list) # per-image per-category evaluation results
1171
+ self.eval = {} # accumulated evaluation results
1172
+
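_prepare buckets annotations by the (image_id, category_id) pair so that the later per-image, per-category loops are plain dictionary lookups. A toy illustration of that grouping, with made-up ids (not data from this repository):

# Toy illustration of the grouping done in _prepare (ids and scores are made up).
from collections import defaultdict

anns = [
    {"image_id": 1, "category_id": 7, "score": 0.9},
    {"image_id": 1, "category_id": 7, "score": 0.4},
    {"image_id": 2, "category_id": 3, "score": 0.8},
]

by_img_cat = defaultdict(list)
for ann in anns:
    by_img_cat[ann["image_id"], ann["category_id"]].append(ann)

print(len(by_img_cat[1, 7]))  # 2 -> both detections for image 1, category 7
print(by_img_cat[9, 9])       # [] -> missing keys yield an empty bucket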
1173
+ def accumulate(self, p = None):
1174
+ '''
1175
+ Accumulate per image evaluation results and store the result in self.eval
1176
+ :param p: input params for evaluation
1177
+ :return: None
1178
+ '''
1179
+
1180
+ print('Accumulating evaluation results...')
1181
+ assert self.evalImgs, 'Please run evaluate() first'
1182
+
1183
+ tic = time.time()
1184
+
1185
+ # allows input customized parameters
1186
+ if p is None:
1187
+ p = self.params
1188
+
1189
+ p.catIds = p.catIds if p.useCats == 1 else [-1]
1190
+
1191
+ T = len(p.iouThrs)
1192
+ R = len(p.recThrs)
1193
+ K = len(p.catIds) if p.useCats else 1
1194
+ A = len(p.areaRng)
1195
+ M = len(p.maxDets)
1196
+
1197
+ precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
1198
+ recall = -np.ones((T,K,A,M))
1199
+ scores = -np.ones((T,R,K,A,M))
1200
+
1201
+ # create dictionary for future indexing
1202
+ _pe = self._paramsEval
1203
+
1204
+ catIds = _pe.catIds if _pe.useCats else [-1]
1205
+ setK = set(catIds)
1206
+ setA = set(map(tuple, _pe.areaRng))
1207
+ setM = set(_pe.maxDets)
1208
+ setI = set(_pe.imgIds)
1209
+
1210
+ # get inds to evaluate
1211
+ catid_list = [k for n, k in enumerate(p.catIds) if k in setK]
1212
+ k_list = [n for n, k in enumerate(p.catIds) if k in setK]
1213
+ m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
1214
+ a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
1215
+ i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
1216
+
1217
+ I0 = len(_pe.imgIds)
1218
+ A0 = len(_pe.areaRng)
1219
+
1220
+ has_precomputed_evals = not (self.evals_per_cat_area is None)
1221
+
1222
+ if has_precomputed_evals:
1223
+ evals_per_cat_area = self.evals_per_cat_area
1224
+ else:
1225
+ evals_per_cat_area = {}
1226
+
1227
+ # retrieve E at each category, area range, and max number of detections
1228
+ for k, (k0, catId) in enumerate(zip(k_list, catid_list)):
1229
+ Nk = k0*A0*I0
1230
+ for a, a0 in enumerate(a_list):
1231
+ Na = a0*I0
1232
+
1233
+ if has_precomputed_evals:
1234
+ E = evals_per_cat_area[(catId, a)]
1235
+
1236
+ else:
1237
+ E = [self.evalImgs[Nk + Na + i] for i in i_list]
1238
+ E = [e for e in E if not e is None]
1239
+ evals_per_cat_area[(catId, a)] = E
1240
+
1241
+ if len(E) == 0:
1242
+ continue
1243
+
1244
+ for m, maxDet in enumerate(m_list):
1245
+
1246
+ dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
1247
+
1248
+ # different sorting methods generate slightly different results.
+ # mergesort is used to be consistent with the Matlab implementation.
1250
+ inds = np.argsort(-dtScores, kind='mergesort')
1251
+ dtScoresSorted = dtScores[inds]
1252
+
1253
+ dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
1254
+ dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
1255
+ gtIg = np.concatenate([e['gtIgnore'] for e in E])
1256
+ npig = np.count_nonzero(gtIg==0)
1257
+
1258
+ if npig == 0:
1259
+ continue
1260
+
1261
+ tps = np.logical_and( dtm, np.logical_not(dtIg) )
1262
+ fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
1263
+
1264
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
1265
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
1266
+
1267
+ for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
1268
+ tp = np.array(tp)
1269
+ fp = np.array(fp)
1270
+ nd = len(tp)
1271
+ rc = tp / npig
1272
+ pr = tp / (fp+tp+np.spacing(1))
1273
+ q = np.zeros((R,))
1274
+ ss = np.zeros((R,))
1275
+
1276
+ if nd:
1277
+ recall[t,k,a,m] = rc[-1]
1278
+
1279
+ else:
1280
+ recall[t,k,a,m] = 0
1281
+
1282
+ # numpy is slow without cython optimization for accessing elements
1283
+ # using python lists gives a significant speed improvement
1284
+ pr = pr.tolist(); q = q.tolist()
1285
+
1286
+ for i in range(nd-1, 0, -1):
1287
+ if pr[i] > pr[i-1]:
1288
+ pr[i-1] = pr[i]
1289
+
1290
+ inds = np.searchsorted(rc, p.recThrs, side='left')
1291
+
1292
+ try:
1293
+ for ri, pi in enumerate(inds):
1294
+ q[ri] = pr[pi]
1295
+ ss[ri] = dtScoresSorted[pi]
1296
+ except:
1297
+ pass
1298
+
1299
+ precision[t,:,k,a,m] = np.array(q)
1300
+ scores[t,:,k,a,m] = np.array(ss)
1301
+
1302
+ self.evals_per_cat_area = evals_per_cat_area
1303
+
1304
+ self.eval = {
1305
+ 'params': p,
1306
+ 'counts': [T, R, K, A, M],
1307
+ 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
1308
+ 'precision': precision,
1309
+ 'recall': recall,
1310
+ 'scores': scores,
1311
+ }
1312
+
1313
+ toc = time.time()
1314
+ print('DONE (t={:0.2f}s).'.format( toc-tic))
1315
+
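The interpolation inside accumulate() is the standard COCO recipe: precision is made non-increasing from right to left, then sampled at the recall thresholds with searchsorted. A self-contained numpy sketch of just that step, on made-up match data:

# Standalone sketch of the precision-envelope + recall sampling used above (toy data).
import numpy as np

tp = np.array([1, 1, 0, 1, 0, 0])          # score-sorted detections: 1 = matched, 0 = not
fp = 1 - tp
tp_sum, fp_sum = np.cumsum(tp), np.cumsum(fp)
npig = 4                                   # number of non-ignored ground truths (toy)

rc = tp_sum / npig
pr = (tp_sum / (tp_sum + fp_sum + np.spacing(1))).tolist()

# right-to-left maximum makes precision monotonically non-increasing
for i in range(len(pr) - 1, 0, -1):
    if pr[i] > pr[i - 1]:
        pr[i - 1] = pr[i]

recThrs = np.linspace(0.0, 1.0, 101)
q = np.zeros_like(recThrs)
inds = np.searchsorted(rc, recThrs, side="left")
for ri, pi in enumerate(inds):
    if pi < len(pr):                       # recalls never reached keep precision 0
        q[ri] = pr[pi]
print(q[[0, 50, 60, 80]])                  # [1.   1.   0.75 0.  ]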
1316
+ def evaluate(self):
1317
+ """
1318
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
1319
+ """
1320
+
1321
+ print("Running per image evaluation...")
1322
+
1323
+ p = self.params
1324
+ print("Evaluate annotation type *{}*".format(p.iouType))
1325
+
1326
+ tic = time.time()
1327
+
1328
+ p.imgIds = list(np.unique(p.imgIds))
1329
+ if p.useCats:
1330
+ p.catIds = list(np.unique(p.catIds))
1331
+
1332
+ p.maxDets = sorted(p.maxDets)
1333
+ self.params = p
1334
+
1335
+ self._prepare()
1336
+
1337
+ catIds = p.catIds if p.useCats else [-1]
1338
+
1339
+ # loop through images, area range, max detection number
1340
+ self.ious = {
1341
+ (imgId, catId): self.computeIoU(imgId, catId)
1342
+ for imgId in p.imgIds
1343
+ for catId in catIds
1344
+ }
1345
+
1346
+ maxDet = p.maxDets[-1]
1347
+
1348
+ self.evalImgs = [
1349
+ self.evaluateImg(imgId, catId, areaRng, maxDet)
1350
+ for catId in catIds
1351
+ for areaRng in p.areaRng
1352
+ for imgId in p.imgIds
1353
+ ]
1354
+
1355
+ self._paramsEval = copy.deepcopy(self.params)
1356
+
1357
+ toc = time.time()
1358
+ print("DONE (t={:0.2f}s).".format(toc - tic))
1359
+
1360
+ def computeIoU(self, imgId, catId):
1361
+ """
1362
+ ComputeIoU computes the IoUs by sorting based on "score"
1363
+ for either 2D boxes (in 2D mode) or 3D boxes (in 3D mode)
1364
+ """
1365
+
1366
+ device = (torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"))
1367
+
1368
+ p = self.params
1369
+ if p.useCats:
1370
+ gt = self._gts[imgId, catId]
1371
+ dt = self._dts[imgId, catId]
1372
+
1373
+ else:
1374
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
1375
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
1376
+
1377
+ if len(gt) == 0 and len(dt) == 0:
1378
+ return []
1379
+
1380
+ inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
1381
+ dt = [dt[i] for i in inds]
1382
+ if len(dt) > p.maxDets[-1]:
1383
+ dt = dt[0 : p.maxDets[-1]]
1384
+
1385
+ if p.iouType == "bbox":
1386
+ if self.mode == "2D":
1387
+ g = [g["bbox"] for g in gt]
1388
+ d = [d["bbox"] for d in dt]
1389
+ elif self.mode == "3D":
1390
+ g = [g["bbox3D"] for g in gt]
1391
+ d = [d["bbox3D"] for d in dt]
1392
+ else:
1393
+ raise Exception("unknown iouType for iou computation")
1394
+
1395
+ # compute iou between each dt and gt region
1396
+ # iscrowd is required in builtin maskUtils so we
1397
+ # use a dummy buffer for it
1398
+ iscrowd = [0 for o in gt]
1399
+ if self.mode == "2D":
1400
+ ious = maskUtils.iou(d, g, iscrowd)
1401
+
1402
+ elif len(d) > 0 and len(g) > 0:
1403
+
1404
+ # For 3D eval, we want to run IoU in CUDA if available
1405
+ if torch.cuda.is_available() and len(d) * len(g) < MAX_DTS_CROSS_GTS_FOR_IOU3D:
1406
+ device = torch.device("cuda:0")
1407
+ else:
1408
+ device = torch.device("cpu")
1409
+
1410
+ dd = torch.tensor(d, device=device, dtype=torch.float32)
1411
+ gg = torch.tensor(g, device=device, dtype=torch.float32)
1412
+
1413
+ ious = box3d_overlap(dd, gg).cpu().numpy()
1414
+
1415
+ else:
1416
+ ious = []
1417
+
1418
+ in_prox = None
1419
+
1420
+ if self.eval_prox:
1421
+ g = [g["bbox"] for g in gt]
1422
+ d = [d["bbox"] for d in dt]
1423
+ iscrowd = [0 for o in gt]
1424
+ ious2d = maskUtils.iou(d, g, iscrowd)
1425
+
1426
+ if type(ious2d) == list:
1427
+ in_prox = []
1428
+
1429
+ else:
1430
+ in_prox = ious2d > p.proximity_thresh
1431
+
1432
+ return ious, in_prox
1433
+
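For proximity evaluation, the 2D IoUs computed at the end of computeIoU are thresholded at p.proximity_thresh to decide which detection/GT pairs count as neighbours (maskUtils.iou consumes COCO-style [x, y, w, h] boxes). A plain stand-in for that test on two hand-made boxes:

# Plain stand-in for the 2D IoU + proximity test above (toy boxes in [x, y, w, h] format).
def iou_xywh(a, b):
    ax, ay, aw, ah = a
    bx, by, bw, bh = b
    ix1, iy1 = max(ax, bx), max(ay, by)
    ix2, iy2 = min(ax + aw, bx + bw), min(ay + ah, by + bh)
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    return inter / (aw * ah + bw * bh - inter)

dt_box = [10, 10, 50, 50]
gt_box = [30, 30, 50, 50]
proximity_thresh = 0.3                         # same default as Omni3DParams
iou = iou_xywh(dt_box, gt_box)
print(round(iou, 3), iou > proximity_thresh)   # 0.22 False -> this pair is not "in proximity"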
1434
+ def evaluateImg(self, imgId, catId, aRng, maxDet):
1435
+ """
1436
+ Perform evaluation for single category and image
1437
+ Returns:
1438
+ dict (single image results)
1439
+ """
1440
+
1441
+ p = self.params
1442
+ if p.useCats:
1443
+ gt = self._gts[imgId, catId]
1444
+ dt = self._dts[imgId, catId]
1445
+
1446
+ else:
1447
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
1448
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
1449
+
1450
+ if len(gt) == 0 and len(dt) == 0:
1451
+ return None
1452
+
1453
+ flag_range = "area" if self.mode == "2D" else "depth"
1454
+ flag_ignore = "ignore2D" if self.mode == "2D" else "ignore3D"
1455
+
1456
+ for g in gt:
1457
+ if g[flag_ignore] or (g[flag_range] < aRng[0] or g[flag_range] > aRng[1]):
1458
+ g["_ignore"] = 1
1459
+ else:
1460
+ g["_ignore"] = 0
1461
+
1462
+ # sort dt highest score first, sort gt ignore last
1463
+ gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort")
1464
+ gt = [gt[i] for i in gtind]
1465
+ dtind = np.argsort([-d["score"] for d in dt], kind="mergesort")
1466
+ dt = [dt[i] for i in dtind[0:maxDet]]
1467
+
1468
+ # load computed ious
1469
+ ious = (
1470
+ self.ious[imgId, catId][0][:, gtind]
1471
+ if len(self.ious[imgId, catId][0]) > 0
1472
+ else self.ious[imgId, catId][0]
1473
+ )
1474
+
1475
+ if self.eval_prox:
1476
+ in_prox = (
1477
+ self.ious[imgId, catId][1][:, gtind]
1478
+ if len(self.ious[imgId, catId][1]) > 0
1479
+ else self.ious[imgId, catId][1]
1480
+ )
1481
+
1482
+ T = len(p.iouThrs)
1483
+ G = len(gt)
1484
+ D = len(dt)
1485
+ gtm = np.zeros((T, G))
1486
+ dtm = np.zeros((T, D))
1487
+ gtIg = np.array([g["_ignore"] for g in gt])
1488
+ dtIg = np.zeros((T, D))
1489
+
1490
+ if not len(ious) == 0:
1491
+ for tind, t in enumerate(p.iouThrs):
1492
+ for dind, d in enumerate(dt):
1493
+
1494
+ # information about best match so far (m=-1 -> unmatched)
1495
+ iou = min([t, 1 - 1e-10])
1496
+ m = -1
1497
+
1498
+ for gind, g in enumerate(gt):
1499
+ # in case of proximity evaluation, if not in proximity continue
1500
+ if self.eval_prox and not in_prox[dind, gind]:
1501
+ continue
1502
+
1503
+ # if this gt already matched, continue
1504
+ if gtm[tind, gind] > 0:
1505
+ continue
1506
+
1507
+ # if dt matched to reg gt, and on ignore gt, stop
1508
+ if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
1509
+ break
1510
+
1511
+ # continue to next gt unless better match made
1512
+ if ious[dind, gind] < iou:
1513
+ continue
1514
+
1515
+ # if match successful and best so far, store appropriately
1516
+ iou = ious[dind, gind]
1517
+ m = gind
1518
+
1519
+ # if match made store id of match for both dt and gt
1520
+ if m == -1:
1521
+ continue
1522
+
1523
+ dtIg[tind, dind] = gtIg[m]
1524
+ dtm[tind, dind] = gt[m]["id"]
1525
+ gtm[tind, m] = d["id"]
1526
+
1527
+ # set unmatched detections outside of area range to ignore
1528
+ a = np.array(
1529
+ [d[flag_range] < aRng[0] or d[flag_range] > aRng[1] for d in dt]
1530
+ ).reshape((1, len(dt)))
1531
+
1532
+ dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
1533
+
1534
+ # in case of proximity evaluation, ignore detections which are far from gt regions
1535
+ if self.eval_prox and len(in_prox) > 0:
1536
+ dt_far = in_prox.any(1) == 0
1537
+ dtIg = np.logical_or(dtIg, np.repeat(dt_far.reshape((1, len(dt))), T, 0))
1538
+
1539
+ # store results for given image and category
1540
+ return {
1541
+ "image_id": imgId,
1542
+ "category_id": catId,
1543
+ "aRng": aRng,
1544
+ "maxDet": maxDet,
1545
+ "dtIds": [d["id"] for d in dt],
1546
+ "gtIds": [g["id"] for g in gt],
1547
+ "dtMatches": dtm,
1548
+ "gtMatches": gtm,
1549
+ "dtScores": [d["score"] for d in dt],
1550
+ "gtIgnore": gtIg,
1551
+ "dtIgnore": dtIg,
1552
+ }
1553
+
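The matching loop above is the usual COCO greedy assignment: detections are visited in descending score order and each one takes the still-unmatched ground truth with the highest IoU above the threshold. A tiny sketch of that rule on a made-up IoU matrix:

# Toy greedy matching in the style of evaluateImg (scores already sorted, IoUs made up).
import numpy as np

iou_thr = 0.25
ious = np.array([[0.60, 0.30],    # detection 0 (highest score) vs GTs 0 and 1
                 [0.55, 0.40]])   # detection 1 vs GTs 0 and 1

gt_matched = [-1, -1]
dt_matched = [-1, -1]
for dind in range(ious.shape[0]):
    best_iou, best_g = iou_thr, -1
    for gind in range(ious.shape[1]):
        if gt_matched[gind] != -1:            # each GT may only be matched once
            continue
        if ious[dind, gind] < best_iou:       # keep the best match seen so far
            continue
        best_iou, best_g = ious[dind, gind], gind
    if best_g != -1:
        dt_matched[dind] = best_g
        gt_matched[best_g] = dind

print(dt_matched)  # [0, 1]: det 0 takes GT 0, so det 1 falls back to GT 1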
1554
+ def summarize(self):
1555
+ """
1556
+ Compute and display summary metrics for evaluation results.
1557
+ Note this function can *only* be applied to the default parameter setting
1558
+ """
1559
+
1560
+ def _summarize(mode, ap=1, iouThr=None, areaRng="all", maxDets=100, log_str=""):
1561
+ p = self.params
1562
+ eval = self.eval
1563
+
1564
+ if mode == "2D":
1565
+ iStr = (" {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}")
1566
+
1567
+ elif mode == "3D":
1568
+ iStr = " {:<18} {} @[ IoU={:<9} | depth={:>6s} | maxDets={:>3d} ] = {:0.3f}"
1569
+
1570
+ titleStr = "Average Precision" if ap == 1 else "Average Recall"
1571
+ typeStr = "(AP)" if ap == 1 else "(AR)"
1572
+
1573
+ iouStr = (
1574
+ "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
1575
+ if iouThr is None
1576
+ else "{:0.2f}".format(iouThr)
1577
+ )
1578
+
1579
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
1580
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
1581
+
1582
+ if ap == 1:
1583
+
1584
+ # dimension of precision: [TxRxKxAxM]
1585
+ s = eval["precision"]
1586
+
1587
+ # IoU
1588
+ if iouThr is not None:
1589
+ t = np.where(np.isclose(iouThr, p.iouThrs.astype(float)))[0]
1590
+ s = s[t]
1591
+
1592
+ s = s[:, :, :, aind, mind]
1593
+
1594
+ else:
1595
+ # dimension of recall: [TxKxAxM]
1596
+ s = eval["recall"]
1597
+ if iouThr is not None:
1598
+ t = np.where(iouThr == p.iouThrs)[0]
1599
+ s = s[t]
1600
+ s = s[:, :, aind, mind]
1601
+
1602
+ if len(s[s > -1]) == 0:
1603
+ mean_s = -1
1604
+
1605
+ else:
1606
+ mean_s = np.mean(s[s > -1])
1607
+
1608
+ if log_str != "":
1609
+ log_str += "\n"
1610
+
1611
+ log_str += "mode={} ".format(mode) + \
1612
+ iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)
1613
+
1614
+ return mean_s, log_str
1615
+
1616
+ def _summarizeDets(mode):
1617
+
1618
+ params = self.params
1619
+
1620
+ # the thresholds here define the thresholds printed in `derive_omni_results`
1621
+ thres = [0.5, 0.75, 0.95] if mode == "2D" else [0.15, 0.25, 0.50]
1622
+
1623
+ stats = np.zeros((13,))
1624
+ stats[0], log_str = _summarize(mode, 1)
1625
+
1626
+ stats[1], log_str = _summarize(
1627
+ mode, 1, iouThr=thres[0], maxDets=params.maxDets[2], log_str=log_str
1628
+ )
1629
+
1630
+ stats[2], log_str = _summarize(
1631
+ mode, 1, iouThr=thres[1], maxDets=params.maxDets[2], log_str=log_str
1632
+ )
1633
+
1634
+ stats[3], log_str = _summarize(
1635
+ mode, 1, iouThr=thres[2], maxDets=params.maxDets[2], log_str=log_str
1636
+ )
1637
+
1638
+ stats[4], log_str = _summarize(
1639
+ mode,
1640
+ 1,
1641
+ areaRng=params.areaRngLbl[1],
1642
+ maxDets=params.maxDets[2],
1643
+ log_str=log_str,
1644
+ )
1645
+
1646
+ stats[5], log_str = _summarize(
1647
+ mode,
1648
+ 1,
1649
+ areaRng=params.areaRngLbl[2],
1650
+ maxDets=params.maxDets[2],
1651
+ log_str=log_str,
1652
+ )
1653
+
1654
+ stats[6], log_str = _summarize(
1655
+ mode,
1656
+ 1,
1657
+ areaRng=params.areaRngLbl[3],
1658
+ maxDets=params.maxDets[2],
1659
+ log_str=log_str,
1660
+ )
1661
+
1662
+ stats[7], log_str = _summarize(
1663
+ mode, 0, maxDets=params.maxDets[0], log_str=log_str
1664
+ )
1665
+
1666
+ stats[8], log_str = _summarize(
1667
+ mode, 0, maxDets=params.maxDets[1], log_str=log_str
1668
+ )
1669
+
1670
+ stats[9], log_str = _summarize(
1671
+ mode, 0, maxDets=params.maxDets[2], log_str=log_str
1672
+ )
1673
+
1674
+ stats[10], log_str = _summarize(
1675
+ mode,
1676
+ 0,
1677
+ areaRng=params.areaRngLbl[1],
1678
+ maxDets=params.maxDets[2],
1679
+ log_str=log_str,
1680
+ )
1681
+
1682
+ stats[11], log_str = _summarize(
1683
+ mode,
1684
+ 0,
1685
+ areaRng=params.areaRngLbl[2],
1686
+ maxDets=params.maxDets[2],
1687
+ log_str=log_str,
1688
+ )
1689
+
1690
+ stats[12], log_str = _summarize(
1691
+ mode,
1692
+ 0,
1693
+ areaRng=params.areaRngLbl[3],
1694
+ maxDets=params.maxDets[2],
1695
+ log_str=log_str,
1696
+ )
1697
+
1698
+ return stats, log_str
1699
+
1700
+ if not self.eval:
1701
+ raise Exception("Please run accumulate() first")
1702
+
1703
+ stats, log_str = _summarizeDets(self.mode)
1704
+ self.stats = stats
1705
+
1706
+ return log_str
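A hedged sketch of how this evaluator is typically driven end to end (cocoGt/cocoDt are assumed to be existing COCO-style API objects; the evaluate → accumulate → summarize order is required by the asserts above):

# Sketch: driving Omni3Deval on pre-built COCO-style GT/DT objects (cocoGt, cocoDt assumed).
def run_omni3d_eval(cocoGt, cocoDt, mode="3D", eval_prox=True):
    evaluator = Omni3Deval(cocoGt, cocoDt, iouType="bbox", mode=mode, eval_prox=eval_prox)
    evaluator.evaluate()          # per-image IoUs + greedy matching, fills evalImgs
    evaluator.accumulate()        # PR curves over thresholds, ranges and maxDets
    print(evaluator.summarize())  # one AP/AR line per setting, prefixed with the mode
    return evaluator.stats        # 13 summary numbers, overall AP first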
cubercnn/modeling/backbone/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .densenet import *
+ from .mnasnet import *
+ from .resnet import *
+ from .shufflenet import *
+ from .dla import *
cubercnn/modeling/backbone/densenet.py ADDED
@@ -0,0 +1,64 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates
+ from torchvision import models
+ from detectron2.layers import ShapeSpec
+ from detectron2.modeling.backbone import Backbone
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
+ import torch.nn.functional as F
+
+ from detectron2.modeling.backbone.fpn import FPN
+
+ class DenseNetBackbone(Backbone):
+     def __init__(self, cfg, input_shape, pretrained=True):
+         super().__init__()
+
+         base = models.densenet121(pretrained)
+         base = base.features
+
+         self.base = base
+
+         self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 1024, 'p6': 1024}
+         self._out_feature_strides = {'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
+         self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
+
+     def forward(self, x):
+
+         outputs = {}
+
+         db1 = self.base[0:5](x)
+         db2 = self.base[5:7](db1)
+         db3 = self.base[7:9](db2)
+         p5 = self.base[9:](db3)
+         p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
+         outputs['p2'] = db1
+         outputs['p3'] = db2
+         outputs['p4'] = db3
+         outputs['p5'] = p5
+         outputs['p6'] = p6
+
+         return outputs
+
+
+ @BACKBONE_REGISTRY.register()
+ def build_densenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
+     """
+     Args:
+         cfg: a detectron2 CfgNode
+
+     Returns:
+         backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+     """
+
+     imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
+
+     bottom_up = DenseNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain)
+     in_features = cfg.MODEL.FPN.IN_FEATURES
+     out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+
+     backbone = FPN(
+         bottom_up=bottom_up,
+         in_features=in_features,
+         out_channels=out_channels,
+         norm=cfg.MODEL.FPN.NORM,
+         fuse_type=cfg.MODEL.FPN.FUSE_TYPE
+     )
+     return backbone
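Every torchvision-based backbone in this folder follows the same pattern: slice the classifier into stages that emit p2–p6 at strides 4–64, then hand them to detectron2's FPN. A small sanity-check sketch (pretrained weights disabled; cfg/input_shape are unused by this __init__) that runs a dummy image through DenseNetBackbone and prints the pyramid shapes:

# Sanity-check sketch for the stage slicing above (no cfg needed, no weight download).
import torch

backbone = DenseNetBackbone(cfg=None, input_shape=None, pretrained=False)
with torch.no_grad():
    feats = backbone(torch.zeros(1, 3, 256, 256))

for name, feat in feats.items():
    print(name, tuple(feat.shape))
# expected, given the channels/strides declared above:
#   p2 (1, 256, 64, 64)   p3 (1, 512, 32, 32)   p4 (1, 1024, 16, 16)
#   p5 (1, 1024, 8, 8)    p6 (1, 1024, 4, 4)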
cubercnn/modeling/backbone/dla.py ADDED
@@ -0,0 +1,507 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import os
3
+ import math
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.utils.model_zoo as model_zoo
9
+ import torch.nn.functional as F
10
+ import detectron2.utils.comm as comm
11
+
12
+ from detectron2.layers import ShapeSpec
13
+ from detectron2.modeling.backbone import Backbone
14
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
15
+ from detectron2.modeling.backbone.fpn import FPN
16
+
17
+ BatchNorm = nn.BatchNorm2d
18
+
19
+ """
20
+ Adapted models from repositories
21
+ Deep Layer Aggregation CVPR 2018
22
+ https://github.com/ucbdrive/dla
23
+ BSD-3 Licence https://github.com/ucbdrive/dla/blob/master/LICENSE
24
+
25
+ Geometry Uncertainty Projection Network for Monocular 3D Object Detection, ICCV 2021
26
+ https://github.com/SuperMHP/GUPNet/blob/main/code/lib/backbones/dla.py
27
+ MIT Licence https://github.com/SuperMHP/GUPNet/blob/main/LICENSE
28
+ """
29
+
30
+ def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
31
+ return os.path.join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
32
+
33
+
34
+ def conv3x3(in_planes, out_planes, stride=1):
35
+ "3x3 convolution with padding"
36
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
37
+ padding=1, bias=False)
38
+
39
+
40
+ class BasicBlock(nn.Module):
41
+ def __init__(self, inplanes, planes, stride=1, dilation=1):
42
+ super(BasicBlock, self).__init__()
43
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
44
+ stride=stride, padding=dilation,
45
+ bias=False, dilation=dilation)
46
+ self.bn1 = BatchNorm(planes)
47
+ self.relu = nn.ReLU(inplace=True)
48
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
49
+ stride=1, padding=dilation,
50
+ bias=False, dilation=dilation)
51
+ self.bn2 = BatchNorm(planes)
52
+ self.stride = stride
53
+
54
+ def forward(self, x, residual=None):
55
+ if residual is None:
56
+ residual = x
57
+
58
+ out = self.conv1(x)
59
+ out = self.bn1(out)
60
+ out = self.relu(out)
61
+
62
+ out = self.conv2(out)
63
+ out = self.bn2(out)
64
+
65
+ out += residual
66
+ out = self.relu(out)
67
+
68
+ return out
69
+
70
+
71
+ class Bottleneck(nn.Module):
72
+ expansion = 2
73
+
74
+ def __init__(self, inplanes, planes, stride=1, dilation=1):
75
+ super(Bottleneck, self).__init__()
76
+ expansion = Bottleneck.expansion
77
+ bottle_planes = planes // expansion
78
+ self.conv1 = nn.Conv2d(inplanes, bottle_planes,
79
+ kernel_size=1, bias=False)
80
+ self.bn1 = BatchNorm(bottle_planes)
81
+ self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
82
+ stride=stride, padding=dilation,
83
+ bias=False, dilation=dilation)
84
+ self.bn2 = BatchNorm(bottle_planes)
85
+ self.conv3 = nn.Conv2d(bottle_planes, planes,
86
+ kernel_size=1, bias=False)
87
+ self.bn3 = BatchNorm(planes)
88
+ self.relu = nn.ReLU(inplace=True)
89
+ self.stride = stride
90
+
91
+ def forward(self, x, residual=None):
92
+ if residual is None:
93
+ residual = x
94
+
95
+ out = self.conv1(x)
96
+ out = self.bn1(out)
97
+ out = self.relu(out)
98
+
99
+ out = self.conv2(out)
100
+ out = self.bn2(out)
101
+ out = self.relu(out)
102
+
103
+ out = self.conv3(out)
104
+ out = self.bn3(out)
105
+
106
+ out += residual
107
+ out = self.relu(out)
108
+
109
+ return out
110
+
111
+
112
+ class BottleneckX(nn.Module):
113
+ expansion = 2
114
+ cardinality = 32
115
+
116
+ def __init__(self, inplanes, planes, stride=1, dilation=1):
117
+ super(BottleneckX, self).__init__()
118
+ cardinality = BottleneckX.cardinality
119
+ # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
120
+ # bottle_planes = dim * cardinality
121
+ bottle_planes = planes * cardinality // 32
122
+ self.conv1 = nn.Conv2d(inplanes, bottle_planes,
123
+ kernel_size=1, bias=False)
124
+ self.bn1 = BatchNorm(bottle_planes)
125
+ self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
126
+ stride=stride, padding=dilation, bias=False,
127
+ dilation=dilation, groups=cardinality)
128
+ self.bn2 = BatchNorm(bottle_planes)
129
+ self.conv3 = nn.Conv2d(bottle_planes, planes,
130
+ kernel_size=1, bias=False)
131
+ self.bn3 = BatchNorm(planes)
132
+ self.relu = nn.ReLU(inplace=True)
133
+ self.stride = stride
134
+
135
+ def forward(self, x, residual=None):
136
+ if residual is None:
137
+ residual = x
138
+
139
+ out = self.conv1(x)
140
+ out = self.bn1(out)
141
+ out = self.relu(out)
142
+
143
+ out = self.conv2(out)
144
+ out = self.bn2(out)
145
+ out = self.relu(out)
146
+
147
+ out = self.conv3(out)
148
+ out = self.bn3(out)
149
+
150
+ out += residual
151
+ out = self.relu(out)
152
+
153
+ return out
154
+
155
+
156
+ class Root(nn.Module):
157
+ def __init__(self, in_channels, out_channels, kernel_size, residual):
158
+ super(Root, self).__init__()
159
+ self.conv = nn.Conv2d(
160
+ in_channels, out_channels, 1,
161
+ stride=1, bias=False, padding=(kernel_size - 1) // 2)
162
+ self.bn = BatchNorm(out_channels)
163
+ self.relu = nn.ReLU(inplace=True)
164
+ self.residual = residual
165
+
166
+ def forward(self, *x):
167
+ children = x
168
+ x = self.conv(torch.cat(x, 1))
169
+ x = self.bn(x)
170
+ if self.residual:
171
+ x += children[0]
172
+ x = self.relu(x)
173
+
174
+ return x
175
+
176
+
177
+ class Tree(nn.Module):
178
+ def __init__(self, levels, block, in_channels, out_channels, stride=1,
179
+ level_root=False, root_dim=0, root_kernel_size=1,
180
+ dilation=1, root_residual=False):
181
+ super(Tree, self).__init__()
182
+ if root_dim == 0:
183
+ root_dim = 2 * out_channels
184
+ if level_root:
185
+ root_dim += in_channels
186
+ if levels == 1:
187
+ self.tree1 = block(in_channels, out_channels, stride,
188
+ dilation=dilation)
189
+ self.tree2 = block(out_channels, out_channels, 1,
190
+ dilation=dilation)
191
+ else:
192
+ self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
193
+ stride, root_dim=0,
194
+ root_kernel_size=root_kernel_size,
195
+ dilation=dilation, root_residual=root_residual)
196
+ self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
197
+ root_dim=root_dim + out_channels,
198
+ root_kernel_size=root_kernel_size,
199
+ dilation=dilation, root_residual=root_residual)
200
+ if levels == 1:
201
+ self.root = Root(root_dim, out_channels, root_kernel_size,
202
+ root_residual)
203
+ self.level_root = level_root
204
+ self.root_dim = root_dim
205
+ self.downsample = None
206
+ self.project = None
207
+ self.levels = levels
208
+ if stride > 1:
209
+ self.downsample = nn.MaxPool2d(stride, stride=stride)
210
+ if in_channels != out_channels:
211
+ self.project = nn.Sequential(
212
+ nn.Conv2d(in_channels, out_channels,
213
+ kernel_size=1, stride=1, bias=False),
214
+ BatchNorm(out_channels)
215
+ )
216
+
217
+ def forward(self, x, residual=None, children=None):
218
+ children = [] if children is None else children
219
+ bottom = self.downsample(x) if self.downsample else x
220
+ residual = self.project(bottom) if self.project else bottom
221
+ if self.level_root:
222
+ children.append(bottom)
223
+ x1 = self.tree1(x, residual)
224
+ if self.levels == 1:
225
+ x2 = self.tree2(x1)
226
+ x = self.root(x2, x1, *children)
227
+ else:
228
+ children.append(x1)
229
+ x = self.tree2(x1, children=children)
230
+ return x
231
+
232
+
233
+ class DLA(nn.Module):
234
+ def __init__(self, levels, channels, num_classes=1000,
235
+ block=BasicBlock, residual_root=False, return_levels=False,
236
+ pool_size=7, linear_root=False):
237
+ super(DLA, self).__init__()
238
+ self.channels = channels
239
+ self.return_levels = return_levels
240
+ self.num_classes = num_classes
241
+ self.base_layer = nn.Sequential(
242
+ nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
243
+ padding=3, bias=False),
244
+ BatchNorm(channels[0]),
245
+ nn.ReLU(inplace=True))
246
+ self.level0 = self._make_conv_level(
247
+ channels[0], channels[0], levels[0])
248
+ self.level1 = self._make_conv_level(
249
+ channels[0], channels[1], levels[1], stride=2)
250
+ self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
251
+ level_root=False,
252
+ root_residual=residual_root)
253
+ self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
254
+ level_root=True, root_residual=residual_root)
255
+ self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
256
+ level_root=True, root_residual=residual_root)
257
+ self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
258
+ level_root=True, root_residual=residual_root)
259
+
260
+ self.avgpool = nn.AvgPool2d(pool_size)
261
+
262
+ for m in self.modules():
263
+ if isinstance(m, nn.Conv2d):
264
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
265
+ m.weight.data.normal_(0, math.sqrt(2. / n))
266
+ elif isinstance(m, BatchNorm):
267
+ m.weight.data.fill_(1)
268
+ m.bias.data.zero_()
269
+
270
+ def _make_level(self, block, inplanes, planes, blocks, stride=1):
271
+ downsample = None
272
+ if stride != 1 or inplanes != planes:
273
+ downsample = nn.Sequential(
274
+ nn.MaxPool2d(stride, stride=stride),
275
+ nn.Conv2d(inplanes, planes,
276
+ kernel_size=1, stride=1, bias=False),
277
+ BatchNorm(planes),
278
+ )
279
+
280
+ layers = []
281
+ layers.append(block(inplanes, planes, stride, downsample=downsample))
282
+ for i in range(1, blocks):
283
+ layers.append(block(inplanes, planes))
284
+
285
+ return nn.Sequential(*layers)
286
+
287
+ def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
288
+ modules = []
289
+ for i in range(convs):
290
+ modules.extend([
291
+ nn.Conv2d(inplanes, planes, kernel_size=3,
292
+ stride=stride if i == 0 else 1,
293
+ padding=dilation, bias=False, dilation=dilation),
294
+ BatchNorm(planes),
295
+ nn.ReLU(inplace=True)])
296
+ inplanes = planes
297
+ return nn.Sequential(*modules)
298
+
299
+
300
+ def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):
301
+
302
+ # load model only on main process
303
+ # to prevent redundent model caching
304
+ if comm.is_main_process():
305
+ model_url = get_model_url(data, name, hash)
306
+ model_weights = model_zoo.load_url(model_url)
307
+ del model_weights['fc.weight']
308
+ del model_weights['fc.bias']
309
+ self.load_state_dict(model_weights)
310
+
311
+
312
+ def dla34(pretrained=False, tricks=False, **kwargs): # DLA-34
313
+ model = DLA([1, 1, 1, 2, 2, 1],
314
+ [16, 32, 64, 128, 256, 512],
315
+ block=BasicBlock, **kwargs)
316
+ if pretrained:
317
+ if tricks:
318
+ model.load_pretrained_model(data='imagenet', name='dla34+tricks', hash='24a49e58')
319
+ else:
320
+ model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
321
+ return model
322
+
323
+
324
+ def dla46_c(pretrained=False, **kwargs): # DLA-46-C
325
+ Bottleneck.expansion = 2
326
+ model = DLA([1, 1, 1, 2, 2, 1],
327
+ [16, 32, 64, 64, 128, 256],
328
+ block=Bottleneck, **kwargs)
329
+ if pretrained:
330
+ model.load_pretrained_model(data='imagenet', name='dla46_c', hash='2bfd52c3')
331
+ return model
332
+
333
+
334
+ def dla46x_c(pretrained=False, **kwargs): # DLA-X-46-C
335
+ BottleneckX.expansion = 2
336
+ model = DLA([1, 1, 1, 2, 2, 1],
337
+ [16, 32, 64, 64, 128, 256],
338
+ block=BottleneckX, **kwargs)
339
+ if pretrained:
340
+ model.load_pretrained_model(data='imagenet', name='dla46x_c', hash='d761bae7')
341
+ return model
342
+
343
+
344
+ def dla60x_c(pretrained=False, **kwargs): # DLA-X-60-C
345
+ BottleneckX.expansion = 2
346
+ model = DLA([1, 1, 1, 2, 3, 1],
347
+ [16, 32, 64, 64, 128, 256],
348
+ block=BottleneckX, **kwargs)
349
+ if pretrained:
350
+ model.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c')
351
+ return model
352
+
353
+
354
+ def dla60(pretrained=False, tricks=False, **kwargs): # DLA-60
355
+ Bottleneck.expansion = 2
356
+ model = DLA([1, 1, 1, 2, 3, 1],
357
+ [16, 32, 128, 256, 512, 1024],
358
+ block=Bottleneck, **kwargs)
359
+ if pretrained:
360
+ if tricks:
361
+ model.load_pretrained_model(data='imagenet', name='dla60+tricks', hash='14488826')
362
+ else:
363
+ model.load_pretrained_model(data='imagenet', name='dla60', hash='24839fc4')
364
+
365
+ return model
366
+
367
+
368
+ def dla60x(pretrained=False, **kwargs): # DLA-X-60
369
+ BottleneckX.expansion = 2
370
+ model = DLA([1, 1, 1, 2, 3, 1],
371
+ [16, 32, 128, 256, 512, 1024],
372
+ block=BottleneckX, **kwargs)
373
+ if pretrained:
374
+ model.load_pretrained_model(data='imagenet', name='dla60x', hash='d15cacda')
375
+ return model
376
+
377
+
378
+ def dla102(pretrained=False, tricks=False, **kwargs): # DLA-102
379
+ Bottleneck.expansion = 2
380
+ model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
381
+ block=Bottleneck, residual_root=True, **kwargs)
382
+ if pretrained:
383
+
384
+ if tricks:
385
+ model.load_pretrained_model(data='imagenet', name='dla102+tricks', hash='27a30eac')
386
+ else:
387
+ model.load_pretrained_model(data='imagenet', name='dla102', hash='d94d9790')
388
+ return model
389
+
390
+
391
+ def dla102x(pretrained=False, **kwargs): # DLA-X-102
392
+ BottleneckX.expansion = 2
393
+ model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
394
+ block=BottleneckX, residual_root=True, **kwargs)
395
+ if pretrained:
396
+ model.load_pretrained_model(data='imagenet', name='dla102x', hash='ad62be81')
397
+ return model
398
+
399
+
400
+ def dla102x2(pretrained=False, **kwargs): # DLA-X-102 64
401
+ BottleneckX.cardinality = 64
402
+ model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
403
+ block=BottleneckX, residual_root=True, **kwargs)
404
+ if pretrained:
405
+ model.load_pretrained_model(data='imagenet', name='dla102x2', hash='262837b6')
406
+ return model
407
+
408
+
409
+ def dla169(pretrained=False, **kwargs): # DLA-169
410
+ Bottleneck.expansion = 2
411
+ model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024],
412
+ block=Bottleneck, residual_root=True, **kwargs)
413
+ if pretrained:
414
+ model.load_pretrained_model(data='imagenet', name='dla169', hash='0914e092')
415
+ return model
416
+
417
+ class DLABackbone(Backbone):
418
+ def __init__(self, cfg, input_shape, pretrained=True):
419
+ super().__init__()
420
+
421
+ if cfg.MODEL.DLA.TYPE == 'dla34':
422
+ base = dla34(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS)
423
+ self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512}
424
+ elif cfg.MODEL.DLA.TYPE == 'dla46_c':
425
+ base = dla46_c(pretrained=pretrained)
426
+ self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256}
427
+ elif cfg.MODEL.DLA.TYPE == 'dla46x_c':
428
+ base = dla46x_c(pretrained=pretrained)
429
+ self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256}
430
+ elif cfg.MODEL.DLA.TYPE == 'dla60x_c':
431
+ base = dla60x_c(pretrained=pretrained)
432
+ self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256}
433
+ elif cfg.MODEL.DLA.TYPE == 'dla60':
434
+ base = dla60(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS)
435
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
436
+ elif cfg.MODEL.DLA.TYPE == 'dla60x':
437
+ base = dla60x(pretrained=pretrained)
438
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
439
+ elif cfg.MODEL.DLA.TYPE == 'dla102':
440
+ base = dla102(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS)
441
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
442
+ elif cfg.MODEL.DLA.TYPE == 'dla102x':
443
+ base = dla102x(pretrained=pretrained)
444
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
445
+ elif cfg.MODEL.DLA.TYPE == 'dla102x2':
446
+ base = dla102x2(pretrained=pretrained)
447
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
448
+ elif cfg.MODEL.DLA.TYPE == 'dla169':
449
+ base = dla169(pretrained=pretrained)
450
+ self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024}
451
+
452
+ self.base_layer = base.base_layer
453
+ self.level0 = base.level0
454
+ self.level1 = base.level1
455
+ self.level2 = base.level2
456
+ self.level3 = base.level3
457
+ self.level4 = base.level4
458
+ self.level5 = base.level5
459
+
460
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
461
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
462
+
463
+ def forward(self, x):
464
+
465
+ outputs = {}
466
+
467
+ base_layer = self.base_layer(x)
468
+ level0 = self.level0(base_layer)
469
+ level1 = self.level1(level0)
470
+ level2 = self.level2(level1)
471
+ level3 = self.level3(level2)
472
+ level4 = self.level4(level3)
473
+ level5 = self.level5(level4)
474
+ level6 = F.max_pool2d(level5, kernel_size=1, stride=2, padding=0)
475
+
476
+ outputs['p2'] = level2
477
+ outputs['p3'] = level3
478
+ outputs['p4'] = level4
479
+ outputs['p5'] = level5
480
+ outputs['p6'] = level6
481
+
482
+ return outputs
483
+
484
+ @BACKBONE_REGISTRY.register()
485
+ def build_dla_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
486
+ """
487
+ Args:
488
+ cfg: a detectron2 CfgNode
489
+
490
+ Returns:
491
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
492
+ """
493
+
494
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
495
+
496
+ bottom_up = DLABackbone(cfg, input_shape, pretrained=imagenet_pretrain)
497
+ in_features = cfg.MODEL.FPN.IN_FEATURES
498
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
499
+
500
+ backbone = FPN(
501
+ bottom_up=bottom_up,
502
+ in_features=in_features,
503
+ out_channels=out_channels,
504
+ norm=cfg.MODEL.FPN.NORM,
505
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
506
+ )
507
+ return backbone
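DLA builds each stage as a recursive Tree whose children are aggregated through a Root node, and DLABackbone simply re-exposes level2–level5 (plus a pooled level6) as the p2–p6 pyramid. A short sketch, mirroring DLABackbone.forward for the dla34 variant with pretrained weights disabled, that prints the per-level shapes (shape comments assume a 256x256 input):

# Sketch: chain the dla34 levels the same way DLABackbone.forward does (no weight download).
import torch
import torch.nn.functional as F

m = dla34(pretrained=False)                  # per-level channels [16, 32, 64, 128, 256, 512]
x = torch.zeros(1, 3, 256, 256)
with torch.no_grad():
    x = m.level1(m.level0(m.base_layer(x)))  # strides 1, 1, 2 so far
    feats = {}
    for name, level in zip(['p2', 'p3', 'p4', 'p5'], [m.level2, m.level3, m.level4, m.level5]):
        x = level(x)                         # each Tree stage halves the resolution
        feats[name] = x
    feats['p6'] = F.max_pool2d(x, kernel_size=1, stride=2)

for name, feat in feats.items():
    print(name, tuple(feat.shape))
# p2 (1, 64, 64, 64)  p3 (1, 128, 32, 32)  p4 (1, 256, 16, 16)  p5 (1, 512, 8, 8)  p6 (1, 512, 4, 4)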
cubercnn/modeling/backbone/mnasnet.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from torchvision import models
3
+ from detectron2.layers import ShapeSpec
4
+ from detectron2.modeling.backbone import Backbone
5
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
6
+ import torch.nn.functional as F
7
+
8
+ from detectron2.modeling.backbone.fpn import FPN
9
+
10
+ class MNASNetBackbone(Backbone):
11
+ def __init__(self, cfg, input_shape, pretrained=True):
12
+ super().__init__()
13
+
14
+ base = models.mnasnet1_0(pretrained)
15
+ base = base.layers
16
+
17
+ self.base = base
18
+
19
+ self._out_feature_channels = {'p2': 24, 'p3': 40, 'p4': 96, 'p5': 320, 'p6': 320}
20
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
21
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
22
+
23
+ def forward(self, x):
24
+
25
+ outputs = {}
26
+
27
+ p2 = self.base[0:9](x)
28
+ p3 = self.base[9](p2)
29
+ p4 = self.base[10:12](p3)
30
+ p5 = self.base[12:14](p4)
31
+ p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
32
+ outputs['p2'] = p2
33
+ outputs['p3'] = p3
34
+ outputs['p4'] = p4
35
+ outputs['p5'] = p5
36
+ outputs['p6'] = p6
37
+
38
+ return outputs
39
+
40
+ @BACKBONE_REGISTRY.register()
41
+ def build_mnasnet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
42
+ """
43
+ Args:
44
+ cfg: a detectron2 CfgNode
45
+
46
+ Returns:
47
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
48
+ """
49
+
50
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
51
+
52
+ bottom_up = MNASNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain)
53
+ in_features = cfg.MODEL.FPN.IN_FEATURES
54
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
55
+
56
+ backbone = FPN(
57
+ bottom_up=bottom_up,
58
+ in_features=in_features,
59
+ out_channels=out_channels,
60
+ norm=cfg.MODEL.FPN.NORM,
61
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
62
+ )
63
+ return backbone
cubercnn/modeling/backbone/resnet.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from torchvision import models
3
+ from detectron2.layers import ShapeSpec
4
+ from detectron2.modeling.backbone import Backbone
5
+ from detectron2.modeling.backbone.fpn import LastLevelMaxPool
6
+ from detectron2.modeling.backbone.resnet import build_resnet_backbone
7
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
8
+ import torch.nn.functional as F
9
+
10
+ from detectron2.modeling.backbone.fpn import FPN
11
+
12
+ class ResNet(Backbone):
13
+ def __init__(self, cfg, input_shape, pretrained=True):
14
+ super().__init__()
15
+
16
+ if cfg.MODEL.RESNETS.DEPTH == 18:
17
+ base = models.resnet18(pretrained)
18
+ self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512}
19
+ elif cfg.MODEL.RESNETS.DEPTH == 34:
20
+ base = models.resnet34(pretrained)
21
+ self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512}
22
+ elif cfg.MODEL.RESNETS.DEPTH == 50:
23
+ base = models.resnet50(pretrained)
24
+ self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048}
25
+ elif cfg.MODEL.RESNETS.DEPTH == 101:
26
+ base = models.resnet101(pretrained)
27
+ self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048}
28
+ else:
29
+ raise ValueError('No configuration currently supporting depth of {}'.format(cfg.MODEL.RESNETS.DEPTH))
30
+
31
+ self.conv1 = base.conv1
32
+ self.bn1 = base.bn1
33
+ self.relu = base.relu
34
+ self.maxpool = base.maxpool
35
+ self.layer1 = base.layer1
36
+ self.layer2 = base.layer2
37
+ self.layer3 = base.layer3
38
+ self.layer4 = base.layer4
39
+
40
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
41
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
42
+
43
+ def forward(self, x):
44
+
45
+ outputs = {}
46
+
47
+ x = self.conv1(x)
48
+ x = self.bn1(x)
49
+ x = self.relu(x)
50
+ x = self.maxpool(x)
51
+ p2 = self.layer1(x)
52
+ p3 = self.layer2(p2)
53
+ p4 = self.layer3(p3)
54
+ p5 = self.layer4(p4)
55
+ p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
56
+
57
+ outputs['p2'] = p2
58
+ outputs['p3'] = p3
59
+ outputs['p4'] = p4
60
+ outputs['p5'] = p5
61
+ outputs['p6'] = p6
62
+
63
+ return outputs
64
+
65
+
66
+ @BACKBONE_REGISTRY.register()
67
+ def build_resnet_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
68
+ """
69
+ Args:
70
+ cfg: a detectron2 CfgNode
71
+
72
+ Returns:
73
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
74
+ """
75
+
76
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
77
+
78
+ if cfg.MODEL.RESNETS.TORCHVISION:
79
+ bottom_up = ResNet(cfg, input_shape, pretrained=imagenet_pretrain)
80
+
81
+ else:
82
+ # use the MSRA modeling logic to build the backbone.
83
+ bottom_up = build_resnet_backbone(cfg, input_shape)
84
+
85
+ in_features = cfg.MODEL.FPN.IN_FEATURES
86
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
87
+
88
+ backbone = FPN(
89
+ bottom_up=bottom_up,
90
+ in_features=in_features,
91
+ out_channels=out_channels,
92
+ norm=cfg.MODEL.FPN.NORM,
93
+ top_block=LastLevelMaxPool(),
94
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
95
+ )
96
+ return backbone
cubercnn/modeling/backbone/shufflenet.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from torchvision import models
3
+ from detectron2.layers import ShapeSpec
4
+ from detectron2.modeling.backbone import Backbone
5
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
6
+ import torch.nn.functional as F
7
+
8
+ from detectron2.modeling.backbone.fpn import FPN
9
+
10
+ class ShufflenetBackbone(Backbone):
11
+ def __init__(self, cfg, input_shape, pretrained=True):
12
+ super().__init__()
13
+
14
+ base = models.shufflenet_v2_x1_0(pretrained)
15
+ self.conv1 = base.conv1
16
+ self.maxpool = base.maxpool
17
+ self.stage2 = base.stage2
18
+ self.stage3 = base.stage3
19
+ self.stage4 = base.stage4
20
+ self.conv5 = base.conv5
21
+
22
+ self._out_feature_channels = {'p2': 24, 'p3': 116, 'p4': 232, 'p5': 464, 'p6': 464}
23
+ self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64}
24
+ self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6']
25
+
26
+ def forward(self, x):
27
+
28
+ outputs = {}
29
+
30
+ x = self.conv1(x)
31
+ p2 = self.maxpool(x)
32
+ p3 = self.stage2(p2)
33
+ p4 = self.stage3(p3)
34
+ p5 = self.stage4(p4)
35
+ p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)
36
+
37
+ outputs['p2'] = p2
38
+ outputs['p3'] = p3
39
+ outputs['p4'] = p4
40
+ outputs['p5'] = p5
41
+ outputs['p6'] = p6
42
+
43
+ return outputs
44
+
45
+
46
+ @BACKBONE_REGISTRY.register()
47
+ def build_shufflenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None):
48
+ """
49
+ Args:
50
+ cfg: a detectron2 CfgNode
51
+
52
+ Returns:
53
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
54
+ """
55
+
56
+ imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == ''
57
+
58
+ bottom_up = ShufflenetBackbone(cfg, input_shape, pretrained=imagenet_pretrain)
59
+ in_features = cfg.MODEL.FPN.IN_FEATURES
60
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
61
+
62
+ backbone = FPN(
63
+ bottom_up=bottom_up,
64
+ in_features=in_features,
65
+ out_channels=out_channels,
66
+ norm=cfg.MODEL.FPN.NORM,
67
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
68
+ )
69
+ return backbone
cubercnn/modeling/meta_arch/__init__.py ADDED
@@ -0,0 +1 @@
+ from .rcnn3d import *
cubercnn/modeling/meta_arch/rcnn3d.py ADDED
@@ -0,0 +1,618 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ import logging
3
+ from typing import Dict, List, Optional
4
+ from detectron2.layers import move_device_like
5
+ from detectron2.structures.image_list import ImageList
6
+ import torch
7
+ import numpy as np
8
+ from detectron2.layers import ShapeSpec, batched_nms
9
+ from detectron2.utils.visualizer import Visualizer
10
+ from detectron2.data.detection_utils import convert_image_to_rgb
11
+ from detectron2.structures import Instances
12
+ from detectron2.utils.events import get_event_storage
13
+ from detectron2.data import MetadataCatalog
14
+
15
+ from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
16
+ from detectron2.modeling.proposal_generator import build_proposal_generator
17
+ from detectron2.utils.logger import _log_api_usage
18
+ from detectron2.modeling.meta_arch import (
19
+ META_ARCH_REGISTRY, GeneralizedRCNN
20
+ )
21
+ # from cubercnn.data.generate_depth_maps import setup_depth_model
22
+ from cubercnn.modeling.roi_heads import build_roi_heads
23
+
24
+ from detectron2.data import MetadataCatalog
25
+ from cubercnn.modeling.roi_heads import build_roi_heads
26
+ from cubercnn import util, vis
27
+ import torch.nn.functional as F
28
+ from detectron2.config import configurable
29
+ import torch.nn as nn
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ @META_ARCH_REGISTRY.register()
35
+ class RCNN3D(GeneralizedRCNN):
36
+
37
+ @classmethod
38
+ def from_config(cls, cfg, priors=None):
39
+ backbone = build_backbone(cfg, priors=priors)
40
+ return {
41
+ "backbone": backbone,
42
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
43
+ "roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
44
+ "input_format": cfg.INPUT.FORMAT,
45
+ "vis_period": cfg.VIS_PERIOD,
46
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
47
+ "pixel_std": cfg.MODEL.PIXEL_STD,
48
+ }
49
+
50
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
51
+
52
+ if not self.training:
53
+ return self.inference(batched_inputs)
54
+
55
+ images = self.preprocess_image(batched_inputs)
56
+
57
+ # scaling factor for the sample relative to its original scale
58
+ # e.g., how much has the image been upsampled by? or downsampled?
59
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
60
+
61
+ # The unmodified intrinsics for the image
62
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
63
+
64
+ if "instances" in batched_inputs[0]:
65
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
66
+ else:
67
+ gt_instances = None
68
+
69
+ # the backbone is actually a FPN, where the DLA model is the bottom-up structure.
70
+ # FPN: https://arxiv.org/abs/1612.03144v2
71
+ # backbone and proposal generator only work on 2D images and annotations.
72
+ features = self.backbone(images.tensor)
73
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
74
+
75
+ instances, detector_losses = self.roi_heads(
76
+ images, features, proposals,
77
+ Ks, im_scales_ratio,
78
+ gt_instances
79
+ )
80
+
81
+ if self.vis_period > 0:
82
+ storage = get_event_storage()
83
+ if storage.iter % self.vis_period == 0 and storage.iter > 0:
84
+ self.visualize_training(batched_inputs, proposals, instances)
85
+
86
+ losses = {}
87
+ losses.update(detector_losses)
88
+ losses.update(proposal_losses)
89
+ return losses
90
+
91
+ def inference(
92
+ self,
93
+ batched_inputs: List[Dict[str, torch.Tensor]],
94
+ detected_instances: Optional[List[Instances]] = None,
95
+ do_postprocess: bool = True,
96
+ ):
97
+ assert not self.training
98
+
99
+ images = self.preprocess_image(batched_inputs)
100
+
101
+ # scaling factor for the sample relative to its original scale
102
+ # e.g., how much has the image been upsampled by? or downsampled?
103
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
104
+
105
+ # The unmodified intrinsics for the image
106
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
107
+
108
+ features = self.backbone(images.tensor)
109
+
110
+ # Pass oracle 2D boxes into the RoI heads
111
+ if type(batched_inputs == list) and np.any(['oracle2D' in b for b in batched_inputs]):
112
+ oracles = [b['oracle2D'] for b in batched_inputs]
113
+ results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None)
114
+
115
+ # normal inference
116
+ else:
117
+ proposals, _ = self.proposal_generator(images, features, None)
118
+ results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None)
119
+
120
+ if do_postprocess:
121
+ assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
122
+ return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
123
+ else:
124
+ return results
125
+
126
+ def visualize_training(self, batched_inputs, proposals, instances):
127
+ """
128
+ A function used to visualize images and proposals. It shows ground truth
129
+ bounding boxes on the original image and up to 20 top-scoring predicted
130
+ object proposals on the original image. Users can implement different
131
+ visualization functions for different models.
132
+ Args:
133
+ batched_inputs (list): a list that contains input to the model.
134
+ proposals (list): a list that contains predicted proposals. Both
135
+ batched_inputs and proposals should have the same length.
136
+ instances (list): a list that contains predicted RoIhead instances. Both
137
+ batched_inputs and proposals should have the same length.
138
+ """
139
+
140
+ storage = get_event_storage()
141
+
142
+ # minimum number of boxes to try to visualize per image
143
+ max_vis_prop = 20
144
+
145
+ if not hasattr(self, 'thing_classes'):
146
+ self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
147
+ self.num_classes = len(self.thing_classes)
148
+
149
+ # make a dummy for 2d scenario
150
+ only2d = instances is None
151
+ if only2d:
152
+ instances = [None]*len(batched_inputs)
153
+
154
+ for input, prop, instances_i in zip(batched_inputs, proposals, instances):
155
+
156
+ img = input["image"]
157
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
158
+ img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
159
+ img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
160
+
161
+ '''
162
+ Visualize the 2D GT and proposal predictions
163
+ '''
164
+ v_gt = Visualizer(img, None)
165
+ v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
166
+ anno_img = v_gt.get_image()
167
+ box_size = min(len(prop.proposal_boxes), max_vis_prop)
168
+ v_pred = Visualizer(img, None)
169
+ v_pred = v_pred.overlay_instances(
170
+ boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
171
+ )
172
+ prop_img = v_pred.get_image()
173
+ vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
174
+ vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
175
+ storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
176
+ if only2d:
177
+ break
178
+ '''
179
+ Visualize the 3D GT and predictions
180
+ '''
181
+ K = torch.tensor(input['K'], device=self.device)
182
+ scale = input['height']/img.shape[0]
183
+ fx, sx = (val.item()/scale for val in K[0, [0, 2]])
184
+ fy, sy = (val.item()/scale for val in K[1, [1, 2]])
185
+
186
+ K_scaled = torch.tensor(
187
+ [[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
188
+ dtype=torch.float32, device=self.device
189
+ ) @ K
190
+
191
+ gts_per_image = input["instances"]
192
+
193
+ gt_classes = gts_per_image.gt_classes
194
+
195
+ # Filter out irrelevant groundtruth
196
+ fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
197
+
198
+ gt_classes = gt_classes[fg_selection_mask]
199
+ gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
200
+ gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
201
+ gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
202
+
203
+ # projected 2D center, depth, w, h, l, 3D center
204
+ gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
205
+
206
+ # this box may have been mirrored and scaled so
207
+ # we need to recompute XYZ in 3D by backprojecting.
208
+ gt_z = gt_boxes3D[:, 2]
209
+
210
+ gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
211
+ gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
212
+
213
+ # put together the GT boxes
214
+ gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
215
+ gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
216
+
217
+ gt_colors = torch.tensor(
218
+ [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
219
+ device=self.device
220
+ )/255.0
221
+
222
+ gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
223
+
224
+ # perform a simple NMS, which is not cls dependent.
225
+ keep = batched_nms(
226
+ instances_i.pred_boxes.tensor,
227
+ instances_i.scores,
228
+ torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
229
+ self.roi_heads.box_predictor.test_nms_thresh
230
+ )
231
+
232
+ keep = keep[:max_vis_prop]
233
+ num_to_visualize = len(keep)
234
+
235
+ pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
236
+ pred_pose = instances_i.pred_pose[keep]
237
+
238
+ pred_colors = torch.tensor(
239
+ [util.get_color(i) for i in range(num_to_visualize)],
240
+ device=self.device
241
+ )/255.0
242
+
243
+ pred_boxes = instances_i.pred_boxes[keep]
244
+ pred_scores = instances_i.scores[keep]
245
+ pred_classes = instances_i.pred_classes[keep]
246
+ pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
247
+ pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
248
+
249
+ # convert to lists
250
+ pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
251
+ gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
252
+
253
+ img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
254
+ img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
255
+
256
+ # horizontal stack 3D GT and pred left/right
257
+ vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
258
+ vis_img_3d = vis_img_3d[:, :, [2, 1, 0]] # RGB
259
+ vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
260
+
261
+ storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
262
+
263
+ break # only visualize one image in a batch
264
+
265
+ @META_ARCH_REGISTRY.register()
266
+ class RCNN3D_combined_features(nn.Module):
267
+
268
+ @configurable
269
+ def __init__(self, *, backbone, proposal_generator, roi_heads, input_format, vis_period, pixel_mean, pixel_std, depth_model, only_2d):
270
+ super().__init__()
271
+ self.backbone = backbone
272
+ self.proposal_generator = proposal_generator
273
+ self.roi_heads = roi_heads
274
+ self.input_format = input_format
275
+ self.vis_period = vis_period
276
+ self.depth_model = depth_model
277
+ self.only_2d = only_2d
278
+
279
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
280
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
281
+ assert (
282
+ self.pixel_mean.shape == self.pixel_std.shape
283
+ ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
284
+
285
+ @classmethod
286
+ def from_config(cls, cfg, priors=None):
287
+ backbone = build_backbone(cfg, priors=priors)
288
+ if False: # some leftover from experimenting with incorporating depth features
289
+ depth_model = 'zoedepth'
290
+ pretrained_resource = 'local::depth/checkpoints/depth_anything_metric_depth_indoor.pt'
291
+ d_model = setup_depth_model(depth_model, pretrained_resource)  # NOTE: the depth model could also be made learnable
292
+
293
+ shape_modified = {key:ShapeSpec(i.channels*2,stride=i.stride) for key, i in backbone.output_shape().items()}
294
+ else:
295
+ d_model = None
296
+ shape_modified = backbone.output_shape()
297
+
298
+ return {
299
+ "backbone": backbone,
300
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
301
+ "roi_heads": build_roi_heads(cfg, shape_modified, priors=priors),
302
+ "input_format": cfg.INPUT.FORMAT,
303
+ "vis_period": cfg.VIS_PERIOD,
304
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
305
+ "pixel_std": cfg.MODEL.PIXEL_STD,
306
+ "depth_model": d_model,
307
+ "only_2d": cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D == 0.0,
308
+ }
309
+
310
+
311
+ @property
312
+ def device(self):
313
+ return self.pixel_mean.device
314
+
315
+ def _move_to_current_device(self, x):
316
+ return move_device_like(x, self.pixel_mean)
317
+
318
+
319
+ def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False):
320
+ """
321
+ Normalize, pad and batch the input images.
322
+ """
323
+ images = [self._move_to_current_device(x[img_type]) for x in batched_inputs]
324
+ if normalise:
325
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
326
+ if convert:
327
+ # convert from BGR to RGB
328
+ images = [x[[2,1,0],:,:] for x in images]
329
+ if to_float:
330
+ images = [x.float()/255.0 for x in images]
331
+ if NoOp:
332
+ images = ImageList.from_tensors(images)
333
+ return images
334
+ images = ImageList.from_tensors(
335
+ images,
336
+ self.backbone.size_divisibility,
337
+ padding_constraints=self.backbone.padding_constraints,
338
+ )
339
+ return images
340
+
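The `NoOp=True` branch above batches images with `ImageList.from_tensors` but skips the backbone's size-divisibility padding. A minimal sketch of what that padding does (dummy tensors only; this illustrates the detectron2 helper, not project-specific behaviour):

import torch
from detectron2.structures import ImageList

# two "images" of different sizes; from_tensors zero-pads both to a common HxW
imgs = [torch.rand(3, 480, 640), torch.rand(3, 360, 500)]
batched = ImageList.from_tensors(imgs)   # default size_divisibility=0, i.e. the NoOp path
print(batched.tensor.shape)              # torch.Size([2, 3, 480, 640])
print(batched.image_sizes)               # per-image (H, W) before padding: [(480, 640), (360, 500)]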
341
+ def _standardize(self, x:torch.Tensor, y:torch.Tensor):
342
+ '''standardise x to match the mean and std of y'''
343
+ ym = y.mean()
344
+ ys = y.std()
345
+ xm = x.mean()
346
+ xs = x.std()
347
+ return (x - xm) * (ys / xs) + ym
348
+
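A quick sanity check of `_standardize`: the result keeps the content of `x` but takes on the mean and standard deviation of `y`. A minimal sketch with random tensors:

import torch

x = torch.randn(1, 256, 32, 32) * 5.0 + 10.0   # arbitrary scale and offset
y = torch.randn(1, 256, 32, 32)                # supplies the target statistics

z = (x - x.mean()) * (y.std() / x.std()) + y.mean()   # same formula as _standardize
print(z.mean().item(), z.std().item())                # approximately y.mean() and y.std()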
349
+ def cat_depth_features(self, features, images_raw):
350
+ pred_o = self.depth_model(images_raw.tensor.float()/255.0)
351
+ # depth features corresponding to p2, p3, p4, p5
352
+
353
+ d_features = pred_o['depth_features']
354
+ # img_features = features['p5']
355
+ # the depth features must be resized to match each conv feature map, otherwise the scales will not correspond correctly during RoI pooling
356
+ for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)):
357
+ d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:], mode='bilinear', align_corners=True)
358
+ d_feature = self._standardize(d_feature, img_feature)
359
+ features[layer] = torch.cat((img_feature, d_feature), dim=1)
360
+ return features
361
+
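Shape-wise, `cat_depth_features` doubles the channel count of every FPN level, which is why `from_config` builds the RoI heads from a `ShapeSpec` with `channels*2` when a depth model is attached. A minimal sketch with dummy tensors; the ordering of the fake depth features is only an assumption chosen so that `reversed(...)` pairs them with p2-p5, and the `_standardize` step is omitted here:

import torch
import torch.nn.functional as F

features = {                                   # dummy FPN outputs, 256 channels each
    'p2': torch.rand(1, 256, 128, 160),
    'p3': torch.rand(1, 256, 64, 80),
    'p4': torch.rand(1, 256, 32, 40),
    'p5': torch.rand(1, 256, 16, 20),
}
d_features = [torch.rand(1, 256, 10, 12), torch.rand(1, 256, 20, 24),   # dummy depth features,
              torch.rand(1, 256, 40, 48), torch.rand(1, 256, 80, 96)]   # finest level last

for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)):
    d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:],
                              mode='bilinear', align_corners=True)
    features[layer] = torch.cat((img_feature, d_feature), dim=1)

print({k: tuple(v.shape) for k, v in features.items()})   # every level now has 512 channels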
362
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
363
+
364
+ if not self.training:
365
+ return self.inference(batched_inputs)  # the segmentor is None at inference time because the loss is not needed
366
+
367
+ images = self.preprocess_image(batched_inputs)
368
+ # NOTE: images_raw are padded to the size of the largest image in the batch.
369
+ # This is necessary because the images have different sizes; to batch them they must all share the same shape.
370
+ images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
371
+ # depth and ground maps are only loaded when the 3D branch is trained
372
+ if not self.only_2d:
373
+ depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True)
374
+
375
+ ground_maps_fail = [i['ground_map'] is None for i in batched_inputs]
376
+ ground_maps_fail_idx = [i for i, x in enumerate(ground_maps_fail) if x]
377
+ for idx in ground_maps_fail_idx:
378
+ batched_inputs[idx]['ground_map'] = torch.tensor([[1]])  # 1x1 dummy marks a missing ground map
379
+ ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True)
380
+ else:
381
+ ground_maps = None
382
+ depth_maps = None
383
+
384
+ # scaling factor for the sample relative to its original scale
385
+ # e.g., how much has the image been upsampled by? or downsampled?
386
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
387
+
388
+ # The unmodified intrinsics for the image
389
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
390
+
391
+ if "instances" in batched_inputs[0]:
392
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
393
+
394
+ features = self.backbone(images.tensor)
395
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
396
+
397
+ if self.depth_model is not None:
398
+ features = self.cat_depth_features(features, images_raw)
399
+
400
+ instances, detector_losses = self.roi_heads(
401
+ images, images_raw, ground_maps, depth_maps, features, proposals,
402
+ Ks, im_scales_ratio,
403
+ gt_instances
404
+ )
405
+
406
+ if self.vis_period > 0:
407
+ storage = get_event_storage()
408
+ if storage.iter % self.vis_period == 0 and storage.iter > 0:
409
+ self.visualize_training(batched_inputs, proposals, instances)
410
+
411
+ losses = {}
412
+ losses.update(detector_losses)
413
+ losses.update(proposal_losses)
414
+ return losses
415
+
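For orientation, a hedged sketch of the per-image dictionary this `forward` expects during training. The field names come from the accesses above (`image`, `depth_map`, `ground_map`, `instances`, `K`, `height`); the shapes, dtypes and the `width` field are assumptions:

import torch
from detectron2.structures import Instances

h, w = 480, 640
sample = {
    "image": torch.randint(0, 255, (3, h, w), dtype=torch.uint8),      # channel order per cfg.INPUT.FORMAT
    "depth_map": torch.rand(1, h, w),                                   # assumed dense depth map
    "ground_map": torch.rand(1, h, w),                                  # may be None; a 1x1 dummy is substituted
    "instances": Instances((h, w)),                                     # gt_boxes, gt_classes, gt_boxes3D, gt_poses, ...
    "K": [[500.0, 0.0, w / 2], [0.0, 500.0, h / 2], [0.0, 0.0, 1.0]],   # original camera intrinsics
    "height": h,                                                        # original height, used for the scale ratio
    "width": w,                                                         # assumed detectron2 convention
}
batched_inputs = [sample]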
416
+ def inference(
417
+ self,
418
+ batched_inputs: List[Dict[str, torch.Tensor]],
419
+ detected_instances: Optional[List[Instances]] = None,
420
+ do_postprocess: bool = True,
421
+ ):
422
+ assert not self.training
423
+
424
+ images = self.preprocess_image(batched_inputs)
425
+ images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True)
426
+ # ground maps and depth maps are assumed to be unavailable at inference time
427
+ ground_maps = None
428
+ depth_maps = None
429
+
430
+ # scaling factor for the sample relative to its original scale
431
+ # e.g., how much has the image been upsampled by? or downsampled?
432
+ im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]
433
+
434
+ # The unmodified intrinsics for the image
435
+ Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]
436
+
437
+ features = self.backbone(images.tensor)
438
+
439
+ # Pass oracle 2D boxes into the RoI heads
440
+ if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
441
+ oracles = [b['oracle2D'] for b in batched_inputs]
442
+ results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, oracles, Ks, im_scales_ratio, None)
443
+
444
+ # normal inference
445
+ else:
446
+ proposals, _ = self.proposal_generator(images, features, None)
447
+ if self.depth_model is not None:
448
+ features = self.cat_depth_features(features, images_raw)
449
+ # pred boxes are proposals
450
+ results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, None)
451
+
452
+ if do_postprocess:
453
+ assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
454
+ return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
455
+ else:
456
+ return results
457
+
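A minimal sketch of calling this model at inference time, assuming `model` was produced by `build_model` further below and trained weights were loaded elsewhere; only `image`, `K` and `height` are read by the code above, and an optional `oracle2D` entry bypasses the proposal generator:

import torch

image = torch.randint(0, 255, (3, 480, 640), dtype=torch.uint8)   # dummy image in cfg.INPUT.FORMAT order
K = [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]]   # dummy intrinsics

model.eval()
with torch.no_grad():
    results = model([{"image": image, "K": K, "height": 480, "width": 640}])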
458
+ def visualize_training(self, batched_inputs, proposals, instances):
459
+ """
460
+ A function used to visualize images and proposals. It shows ground truth
461
+ bounding boxes on the original image and up to 20 top-scoring predicted
462
+ object proposals on the original image. Users can implement different
463
+ visualization functions for different models.
464
+ Args:
465
+ batched_inputs (list): a list that contains input to the model.
466
+ proposals (list): a list that contains predicted proposals. Both
467
+ batched_inputs and proposals should have the same length.
468
+ instances (list): a list that contains predicted RoI head instances. Both
469
+ batched_inputs and instances should have the same length.
470
+ """
471
+
472
+ storage = get_event_storage()
473
+
474
+ # maximum number of boxes to visualize per image
475
+ max_vis_prop = 20
476
+
477
+ if not hasattr(self, 'thing_classes'):
478
+ self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
479
+ self.num_classes = len(self.thing_classes)
480
+ only2d = instances is None
481
+ if only2d:
482
+ instances = [None]*len(batched_inputs)
483
+ for input, prop, instances_i in zip(batched_inputs, proposals, instances):
484
+
485
+ img = input["image"]
486
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
487
+ img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
488
+ img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR
489
+
490
+ '''
491
+ Visualize the 2D GT and proposal predictions
492
+ '''
493
+ v_gt = Visualizer(img, None)
494
+ v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
495
+ anno_img = v_gt.get_image()
496
+ box_size = min(len(prop.proposal_boxes), max_vis_prop)
497
+ v_pred = Visualizer(img, None)
498
+ v_pred = v_pred.overlay_instances(
499
+ boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
500
+ )
501
+ prop_img = v_pred.get_image()
502
+ vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
503
+ vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
504
+ storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
505
+ if only2d:
506
+ break
507
+ '''
508
+ Visualize the 3D GT and predictions
509
+ '''
510
+ K = torch.tensor(input['K'], device=self.device)
511
+ scale = input['height']/img.shape[0]
512
+ fx, sx = (val.item()/scale for val in K[0, [0, 2]])
513
+ fy, sy = (val.item()/scale for val in K[1, [1, 2]])
514
+
515
+ K_scaled = torch.tensor(
516
+ [[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]],
517
+ dtype=torch.float32, device=self.device
518
+ ) @ K
519
+
520
+ gts_per_image = input["instances"]
521
+
522
+ gt_classes = gts_per_image.gt_classes
523
+
524
+ # Filter out irrelevant groundtruth
525
+ fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)
526
+
527
+ gt_classes = gt_classes[fg_selection_mask]
528
+ gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
529
+ gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes
530
+ gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses
531
+
532
+ # projected 2D center, depth, w, h, l, 3D center
533
+ gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]
534
+
535
+ # this box may have been mirrored and scaled so
536
+ # we need to recompute XYZ in 3D by backprojecting.
537
+ gt_z = gt_boxes3D[:, 2]
538
+
539
+ gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx
540
+ gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy
541
+
542
+ # put together the GT boxes
543
+ gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
544
+ gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)
545
+
546
+ gt_colors = torch.tensor(
547
+ [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
548
+ device=self.device
549
+ )/255.0
550
+
551
+ gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)
552
+
553
+ # perform a simple NMS, which is not cls dependent.
554
+ keep = batched_nms(
555
+ instances_i.pred_boxes.tensor,
556
+ instances_i.scores,
557
+ torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
558
+ self.roi_heads.box_predictor.test_nms_thresh
559
+ )
560
+
561
+ keep = keep[:max_vis_prop]
562
+ num_to_visualize = len(keep)
563
+
564
+ pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
565
+ pred_pose = instances_i.pred_pose[keep]
566
+
567
+ pred_colors = torch.tensor(
568
+ [util.get_color(i) for i in range(num_to_visualize)],
569
+ device=self.device
570
+ )/255.0
571
+
572
+ pred_boxes = instances_i.pred_boxes[keep]
573
+ pred_scores = instances_i.scores[keep]
574
+ pred_classes = instances_i.pred_classes[keep]
575
+ pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
576
+ pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)
577
+
578
+ # convert to lists
579
+ pred_meshes = [pred_meshes[i].detach() for i in range(len(pred_meshes))]
580
+ gt_meshes = [gt_meshes[i] for i in range(len(gt_meshes))]
581
+
582
+ img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
583
+ img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
584
+
585
+ # horizontal stack 3D GT and pred left/right
586
+ vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
587
+ vis_img_3d = vis_img_3d[:, :, [2, 1, 0]] # RGB
588
+ vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)
589
+
590
+ storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)
591
+
592
+ break # only visualize one image in a batch
593
+
594
+ def build_model(cfg, priors=None):
595
+ """
596
+ Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
597
+ Note that it does not load any weights from ``cfg``.
598
+ """
599
+ meta_arch = cfg.MODEL.META_ARCHITECTURE
600
+ model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors)
601
+ model.to(torch.device(cfg.MODEL.DEVICE))
602
+ _log_api_usage("modeling.meta_arch." + meta_arch)
603
+ return model
604
+
605
+ def build_backbone(cfg, input_shape=None, priors=None):
606
+ """
607
+ Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
608
+
609
+ Returns:
610
+ an instance of :class:`Backbone`
611
+ """
612
+ if input_shape is None:
613
+ input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
614
+
615
+ backbone_name = cfg.MODEL.BACKBONE.NAME
616
+ backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors)
617
+ assert isinstance(backbone, Backbone)
618
+ return backbone
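A hedged usage sketch for the two builders above; `cfg` is assumed to be a detectron2-style CfgNode created and merged from one of the YAML files under configs/ by the project's own config utilities (not shown here):

model = build_model(cfg, priors=None)   # instantiates cfg.MODEL.META_ARCHITECTURE and moves it to cfg.MODEL.DEVICE
backbone = build_backbone(cfg)          # standalone backbone; input channels default to len(cfg.MODEL.PIXEL_MEAN)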